spaCy/spacy/language.py

from typing import Iterator, Optional, Any, Dict, Callable, Iterable, TypeVar
from typing import Union, List, Pattern, overload
from typing import Tuple
from dataclasses import dataclass
import random
import itertools
import functools
from contextlib import contextmanager
from copy import deepcopy
from pathlib import Path
import warnings
from thinc.api import get_current_ops, Config, Optimizer
import srsly
import multiprocessing as mp
from itertools import chain, cycle
from timeit import default_timer as timer
import traceback

from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .training import Example, validate_examples
from .training.initialize import init_vocab, init_tok2vec
from .scorer import Scorer
from .util import registry, SimpleFrozenList, _pipe, raise_error
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .util import warn_if_jupyter_cupy
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
from .tokenizer import Tokenizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
from .schemas import ConfigSchemaPretrain, validate_init_settings
from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups


# This is the base config will all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# This is the base config for the [pretraining] block and currently not included
# in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"


class BaseDefaults:
    """Language data defaults, available via Language.Defaults. Can be
    overwritten by language subclasses by defining their own subclasses of
    Language.Defaults.
    """

    config: Config = Config(section_order=CONFIG_SECTION_ORDER)
    tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
    prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
    suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
    infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES
    token_match: Optional[Pattern] = None
    url_match: Optional[Pattern] = URL_MATCH
    syntax_iterators: Dict[str, Callable] = {}
    lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
    stop_words = set()
    writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}


@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
    """Registered function to create a tokenizer. Returns a factory that takes
    the nlp object and returns a Tokenizer instance using the language detaults.
    """

    def tokenizer_factory(nlp: "Language") -> Tokenizer:
        prefixes = nlp.Defaults.prefixes
        suffixes = nlp.Defaults.suffixes
        infixes = nlp.Defaults.infixes
        prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
        suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
        infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
        return Tokenizer(
            nlp.vocab,
            rules=nlp.Defaults.tokenizer_exceptions,
            prefix_search=prefix_search,
            suffix_search=suffix_search,
            infix_finditer=infix_finditer,
            token_match=nlp.Defaults.token_match,
            url_match=nlp.Defaults.url_match,
        )

    return tokenizer_factory


@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
    lookups = load_lookups(lang=lang, tables=tables)
    return lookups


class Language:
    """A text-processing pipeline. Usually you'll load this once per process,
    and pass the instance around your application.

    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
    lang (str): Two-letter language ID, i.e. ISO code.

    DOCS: https://spacy.io/api/language
    """

    Defaults = BaseDefaults
    lang: str = None
    default_config = DEFAULT_CONFIG

    factories = SimpleFrozenDict(error=Errors.E957)
    _factory_meta: Dict[str, "FactoryMeta"] = {}  # meta by factory

    def __init__(
        self,
        vocab: Union[Vocab, bool] = True,
        *,
        max_length: int = 10 ** 6,
        meta: Dict[str, Any] = {},
        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
        batch_size: int = 1000,
        **kwargs,
    ) -> None:
        """Initialise a Language object.

        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
        meta (dict): Custom meta data for the Language class. Is written to by
            models to add model meta data.
        max_length (int): Maximum number of characters in a single text. The
            current models may run out memory on extremely long texts, due to
            large internal allocations. You should segment these texts into
            meaningful units, e.g. paragraphs, subsections etc, before passing
            them to spaCy. Default maximum length is 1,000,000 charas (1mb). As
            a rule of thumb, if all pipeline components are enabled, spaCy's
            default models currently requires roughly 1GB of temporary memory per
            100,000 characters in one text.
        create_tokenizer (Callable): Function that takes the nlp object and
            returns a tokenizer.
        batch_size (int): Default batch size for pipe and evaluate.

        DOCS: https://spacy.io/api/language#init
        """
        # We're only calling this to import all factories provided via entry
        # points. The factory decorator applied to these functions takes care
        # of the rest.
        util.registry._entry_point_factories.get_all()

        self._config = DEFAULT_CONFIG.merge(self.default_config)
        self._meta = dict(meta)
        self._path = None
        self._optimizer = None
        # Component meta and configs are only needed on the instance
        self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
        self._pipe_configs: Dict[str, Config] = {}  # config by component

        if not isinstance(vocab, Vocab) and vocab is not True:
            raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
        if vocab is True:
            vectors_name = meta.get("vectors", {}).get("name")
            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
        else:
            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
        self.vocab: Vocab = vocab
        if self.lang is None:
            self.lang = self.vocab.lang
        self._components = []
        self._disabled = set()
        self.max_length = max_length
        # Create the default tokenizer from the default config
        if not create_tokenizer:
            tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
            create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
        self.tokenizer = create_tokenizer(self)
        self.batch_size = batch_size
        self.default_error_handler = raise_error

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
        cls.default_config["nlp"]["lang"] = cls.lang

    @property
    def path(self):
        return self._path

    @property
    def meta(self) -> Dict[str, Any]:
        """Custom meta data of the language class. If a model is loaded, this
        includes details from the model's meta.json.

        RETURNS (Dict[str, Any]): The meta.

        DOCS: https://spacy.io/api/language#meta
        """
        spacy_version = util.get_minor_version_range(about.__version__)
        if self.vocab.lang:
            self._meta.setdefault("lang", self.vocab.lang)
        else:
            self._meta.setdefault("lang", self.lang)
        self._meta.setdefault("name", "pipeline")
        self._meta.setdefault("version", "0.0.0")
        self._meta.setdefault("spacy_version", spacy_version)
        self._meta.setdefault("description", "")
        self._meta.setdefault("author", "")
        self._meta.setdefault("email", "")
        self._meta.setdefault("url", "")
        self._meta.setdefault("license", "")
        self._meta.setdefault("spacy_git_version", GIT_VERSION)
        self._meta["vectors"] = {
            "width": self.vocab.vectors_length,
            "vectors": len(self.vocab.vectors),
            "keys": self.vocab.vectors.n_keys,
            "name": self.vocab.vectors.name,
        }
        self._meta["labels"] = dict(self.pipe_labels)
        # TODO: Adding this back to prevent breaking people's code etc., but
        # we should consider removing it
        self._meta["pipeline"] = list(self.pipe_names)
        self._meta["components"] = list(self.component_names)
        self._meta["disabled"] = list(self.disabled)
        return self._meta

    @meta.setter
    def meta(self, value: Dict[str, Any]) -> None:
        self._meta = value

    @property
    def config(self) -> Config:
        """Trainable config for the current language instance. Includes the
        current pipeline components, as well as default training config.

        RETURNS (thinc.api.Config): The config.

        DOCS: https://spacy.io/api/language#config
        """
        self._config.setdefault("nlp", {})
        self._config.setdefault("training", {})
        self._config["nlp"]["lang"] = self.lang
        # We're storing the filled config for each pipeline component and so
        # we can populate the config again later
        pipeline = {}
        score_weights = []
        for pipe_name in self.component_names:
            pipe_meta = self.get_pipe_meta(pipe_name)
            pipe_config = self.get_pipe_config(pipe_name)
            pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
            if pipe_meta.default_score_weights:
                score_weights.append(pipe_meta.default_score_weights)
        self._config["nlp"]["pipeline"] = list(self.component_names)
        self._config["nlp"]["disabled"] = list(self.disabled)
        self._config["components"] = pipeline
        # We're merging the existing score weights back into the combined
        # weights to make sure we're preserving custom settings in the config
        # but also reflect updates (e.g. new components added)
        prev_weights = self._config["training"].get("score_weights", {})
        combined_score_weights = combine_score_weights(score_weights, prev_weights)
        self._config["training"]["score_weights"] = combined_score_weights
        if not srsly.is_json_serializable(self._config):
            raise ValueError(Errors.E961.format(config=self._config))
        return self._config

    @config.setter
    def config(self, value: Config) -> None:
        self._config = value

    @property
    def disabled(self) -> List[str]:
        """Get the names of all disabled components.

        RETURNS (List[str]): The disabled components.
        """
        # Make sure the disabled components are returned in the order they
        # appear in the pipeline (which isn't guaranteed by the set)
        names = [name for name, _ in self._components if name in self._disabled]
        return SimpleFrozenList(names, error=Errors.E926.format(attr="disabled"))

    @property
    def factory_names(self) -> List[str]:
        """Get names of all available factories.

        RETURNS (List[str]): The factory names.
        """
        names = list(self.factories.keys())
        return SimpleFrozenList(names)

    @property
    def components(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
        """Get all (name, component) tuples in the pipeline, including the
        currently disabled components.
        """
        return SimpleFrozenList(
            self._components, error=Errors.E926.format(attr="components")
        )

    @property
    def component_names(self) -> List[str]:
        """Get the names of the available pipeline components. Includes all
        active and inactive pipeline components.

        RETURNS (List[str]): List of component name strings, in order.
        """
        names = [pipe_name for pipe_name, _ in self._components]
        return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))

    @property
    def pipeline(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
        """The processing pipeline consisting of (name, component) tuples. The
        components are called on the Doc in order as it passes through the
        pipeline.

        RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
        """
        pipes = [(n, p) for n, p in self._components if n not in self._disabled]
        return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))

    @property
    def pipe_names(self) -> List[str]:
        """Get names of available active pipeline components.

        RETURNS (List[str]): List of component name strings, in order.
        """
        names = [pipe_name for pipe_name, _ in self.pipeline]
        return SimpleFrozenList(names, error=Errors.E926.format(attr="pipe_names"))

    @property
    def pipe_factories(self) -> Dict[str, str]:
        """Get the component factories for the available pipeline components.

        RETURNS (Dict[str, str]): Factory names, keyed by component names.
        """
        factories = {}
        for pipe_name, pipe in self._components:
            factories[pipe_name] = self.get_pipe_meta(pipe_name).factory
        return SimpleFrozenDict(factories)

    @property
    def pipe_labels(self) -> Dict[str, List[str]]:
        """Get the labels set by the pipeline components, if available (if
        the component exposes a labels property).

        RETURNS (Dict[str, List[str]]): Labels keyed by component name.
        """
        labels = {}
        for name, pipe in self._components:
            if hasattr(pipe, "labels"):
                labels[name] = list(pipe.labels)
        return SimpleFrozenDict(labels)

    @classmethod
    def has_factory(cls, name: str) -> bool:
        """RETURNS (bool): Whether a factory of that name is registered."""
        internal_name = cls.get_factory_name(name)
        return name in registry.factories or internal_name in registry.factories

    @classmethod
    def get_factory_name(cls, name: str) -> str:
        """Get the internal factory name based on the language subclass.

        name (str): The factory name.
        RETURNS (str): The internal factory name.
        """
        if cls.lang is None:
            return name
        return f"{cls.lang}.{name}"

    @classmethod
    def get_factory_meta(cls, name: str) -> "FactoryMeta":
        """Get the meta information for a given factory name.

        name (str): The component factory name.
        RETURNS (FactoryMeta): The meta for the given factory name.
        """
        internal_name = cls.get_factory_name(name)
        if internal_name in cls._factory_meta:
            return cls._factory_meta[internal_name]
        if name in cls._factory_meta:
            return cls._factory_meta[name]
        raise ValueError(Errors.E967.format(meta="factory", name=name))

    @classmethod
    def set_factory_meta(cls, name: str, value: "FactoryMeta") -> None:
        """Set the meta information for a given factory name.

        name (str): The component factory name.
        value (FactoryMeta): The meta to set.
        """
        cls._factory_meta[cls.get_factory_name(name)] = value

    def get_pipe_meta(self, name: str) -> "FactoryMeta":
        """Get the meta information for a given component name.

        name (str): The component name.
        RETURNS (FactoryMeta): The meta for the given component name.
        """
        if name not in self._pipe_meta:
            raise ValueError(Errors.E967.format(meta="component", name=name))
        return self._pipe_meta[name]

    def get_pipe_config(self, name: str) -> Config:
        """Get the config used to create a pipeline component.

        name (str): The component name.
        RETURNS (Config): The config used to create the pipeline component.
        """
        if name not in self._pipe_configs:
            raise ValueError(Errors.E960.format(name=name))
        pipe_config = self._pipe_configs[name]
        return pipe_config

    @classmethod
    def factory(
        cls,
        name: str,
        *,
        default_config: Dict[str, Any] = SimpleFrozenDict(),
        assigns: Iterable[str] = SimpleFrozenList(),
        requires: Iterable[str] = SimpleFrozenList(),
        retokenizes: bool = False,
        default_score_weights: Dict[str, float] = SimpleFrozenDict(),
        func: Optional[Callable] = None,
    ) -> Callable:
        """Register a new pipeline component factory. Can be used as a decorator
        on a function or classmethod, or called as a function with the factory
        provided as the func keyword argument. To create a component and add
        it to the pipeline, you can use nlp.add_pipe(name).

        name (str): The name of the component factory.
        default_config (Dict[str, Any]): Default configuration, describing the
            default values of the factory arguments.
        assigns (Iterable[str]): Doc/Token attributes assigned by this component,
            e.g. "token.ent_id". Used for pipeline analysis.
        requires (Iterable[str]): Doc/Token attributes required by this component,
            e.g. "token.ent_id". Used for pipeline analysis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        default_score_weights (Dict[str, float]): The scores to report during
            training, and their default weight towards the final score used to
            select the best model. Weights should sum to 1.0 per component and
            will be combined and normalized for the whole pipeline. If None,
            the score won't be shown in the logs or be weighted.
        func (Optional[Callable]): Factory function if not used as a decorator.

        DOCS: https://spacy.io/api/language#factory
        """
        if not isinstance(name, str):
            raise ValueError(Errors.E963.format(decorator="factory"))
        if not isinstance(default_config, dict):
            err = Errors.E962.format(
                style="default config", name=name, cfg_type=type(default_config)
            )
            raise ValueError(err)

        def add_factory(factory_func: Callable) -> Callable:
            internal_name = cls.get_factory_name(name)
            if internal_name in registry.factories:
                # We only check for the internal name here – it's okay if it's a
                # subclass and the base class has a factory of the same name. We
                # also only raise if the function is different to prevent raising
                # if module is reloaded.
                existing_func = registry.factories.get(internal_name)
                if not util.is_same_func(factory_func, existing_func):
                    err = Errors.E004.format(
                        name=name, func=existing_func, new_func=factory_func
                    )
                    raise ValueError(err)

            arg_names = util.get_arg_names(factory_func)
            if "nlp" not in arg_names or "name" not in arg_names:
                raise ValueError(Errors.E964.format(name=name))
            # Officially register the factory so we can later call
            # registry.resolve and refer to it in the config as
            # @factories = "spacy.Language.xyz". We use the class name here so
            # different classes can have different factories.
            registry.factories.register(internal_name, func=factory_func)
            factory_meta = FactoryMeta(
                factory=name,
                default_config=default_config,
                assigns=validate_attrs(assigns),
                requires=validate_attrs(requires),
                scores=list(default_score_weights.keys()),
                default_score_weights=default_score_weights,
                retokenizes=retokenizes,
            )
            cls.set_factory_meta(name, factory_meta)
            # We're overwriting the class attr with a frozen dict to handle
            # backwards-compat (writing to Language.factories directly). This
            # wouldn't work with an instance property and just produce a
            # confusing error – here we can show a custom error
            cls.factories = SimpleFrozenDict(
                registry.factories.get_all(), error=Errors.E957
            )
            return factory_func

        if func is not None:  # Support non-decorator use cases
            return add_factory(func)
        return add_factory

    @classmethod
    def component(
        cls,
        name: Optional[str] = None,
        *,
        assigns: Iterable[str] = SimpleFrozenList(),
        requires: Iterable[str] = SimpleFrozenList(),
        retokenizes: bool = False,
        func: Optional[Callable[[Doc], Doc]] = None,
    ) -> Callable:
        """Register a new pipeline component. Can be used for stateless function
        components that don't require a separate factory. Can be used as a
        decorator on a function or classmethod, or called as a function with the
        factory provided as the func keyword argument. To create a component and
        add it to the pipeline, you can use nlp.add_pipe(name).

        name (str): The name of the component factory.
        assigns (Iterable[str]): Doc/Token attributes assigned by this component,
            e.g. "token.ent_id". Used for pipeline analysis.
        requires (Iterable[str]): Doc/Token attributes required by this component,
            e.g. "token.ent_id". Used for pipeline analysis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        func (Optional[Callable]): Factory function if not used as a decorator.

        DOCS: https://spacy.io/api/language#component
        """
        if name is not None and not isinstance(name, str):
            raise ValueError(Errors.E963.format(decorator="component"))
        component_name = name if name is not None else util.get_object_name(func)

        def add_component(component_func: Callable[[Doc], Doc]) -> Callable:
            if isinstance(func, type):  # function is a class
                raise ValueError(Errors.E965.format(name=component_name))

            def factory_func(nlp: cls, name: str) -> Callable[[Doc], Doc]:
                return component_func

            internal_name = cls.get_factory_name(name)
            if internal_name in registry.factories:
                # We only check for the internal name here – it's okay if it's a
                # subclass and the base class has a factory of the same name. We
                # also only raise if the function is different to prevent raising
                # if module is reloaded. It's hacky, but we need to check the
                # existing functure for a closure and whether that's identical
                # to the component function (because factory_func created above
                # will always be different, even for the same function)
                existing_func = registry.factories.get(internal_name)
                closure = existing_func.__closure__
                wrapped = [c.cell_contents for c in closure][0] if closure else None
                if util.is_same_func(wrapped, component_func):
                    factory_func = existing_func  # noqa: F811

            cls.factory(
                component_name,
                assigns=assigns,
                requires=requires,
                retokenizes=retokenizes,
                func=factory_func,
            )
            return component_func

        if func is not None:  # Support non-decorator use cases
            return add_component(func)
        return add_component

    def analyze_pipes(
        self,
        *,
        keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
        pretty: bool = False,
    ) -> Optional[Dict[str, Any]]:
        """Analyze the current pipeline components, print a summary of what
        they assign or require and check that all requirements are met.

        keys (List[str]): The meta values to display in the table. Corresponds
            to values in FactoryMeta, defined by @Language.factory decorator.
        pretty (bool): Pretty-print the results.
        RETURNS (dict): The data.
        """
        analysis = analyze_pipes(self, keys=keys)
        if pretty:
            print_pipe_analysis(analysis, keys=keys)
        return analysis

    def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
        """Get a pipeline component for a given component name.

        name (str): Name of pipeline component to get.
        RETURNS (callable): The pipeline component.

        DOCS: https://spacy.io/api/language#get_pipe
        """
        for pipe_name, component in self._components:
            if pipe_name == name:
                return component
        raise KeyError(Errors.E001.format(name=name, opts=self.component_names))

    def create_pipe(
        self,
        factory_name: str,
        name: Optional[str] = None,
        *,
        config: Dict[str, Any] = SimpleFrozenDict(),
        raw_config: Optional[Config] = None,
        validate: bool = True,
    ) -> Callable[[Doc], Doc]:
        """Create a pipeline component. Mostly used internally. To create and
        add a component to the pipeline, you can use nlp.add_pipe.

        factory_name (str): Name of component factory.
        name (Optional[str]): Optional name to assign to component instance.
            Defaults to factory name if not set.
        config (Dict[str, Any]): Config parameters to use for this component.
            Will be merged with default config, if available.
        raw_config (Optional[Config]): Internals: the non-interpolated config.
        validate (bool): Whether to validate the component config against the
            arguments and types expected by the factory.
        RETURNS (Callable[[Doc], Doc]): The pipeline component.

        DOCS: https://spacy.io/api/language#create_pipe
        """
        name = name if name is not None else factory_name
        if not isinstance(config, dict):
            err = Errors.E962.format(style="config", name=name, cfg_type=type(config))
            raise ValueError(err)
        if not srsly.is_json_serializable(config):
            raise ValueError(Errors.E961.format(config=config))
        if not self.has_factory(factory_name):
            err = Errors.E002.format(
                name=factory_name,
                opts=", ".join(self.factory_names),
                method="create_pipe",
                lang=util.get_object_name(self),
                lang_code=self.lang,
            )
            raise ValueError(err)
        pipe_meta = self.get_factory_meta(factory_name)
        # This is unideal, but the alternative would mean you always need to
        # specify the full config settings, which is not really viable.
        if pipe_meta.default_config:
            config = Config(pipe_meta.default_config).merge(config)
        internal_name = self.get_factory_name(factory_name)
        # If the language-specific factory doesn't exist, try again with the
        # not-specific name
        if internal_name not in registry.factories:
            internal_name = factory_name
        # The name allows components to know their pipe name and use it in the
        # losses etc. (even if multiple instances of the same factory are used)
        config = {"nlp": self, "name": name, **config, "@factories": internal_name}
        # We need to create a top-level key because Thinc doesn't allow resolving
        # top-level references to registered functions. Also gives nicer errors.
        cfg = {factory_name: config}
        # We're calling the internal _fill here to avoid constructing the
        # registered functions twice
        resolved = registry.resolve(cfg, validate=validate)
        filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
        filled = Config(filled)
        filled["factory"] = factory_name
        filled.pop("@factories", None)
        # Remove the extra values we added because we don't want to keep passing
        # them around, copying them etc.
        filled.pop("nlp", None)
        filled.pop("name", None)
        # Merge the final filled config with the raw config (including non-
        # interpolated variables)
        if raw_config:
            filled = filled.merge(raw_config)
        self._pipe_configs[name] = filled
        return resolved[factory_name]

    def create_pipe_from_source(
        self, source_name: str, source: "Language", *, name: str
    ) -> Tuple[Callable[[Doc], Doc], str]:
        """Create a pipeline component by copying it from an existing model.

        source_name (str): Name of the component in the source pipeline.
        source (Language): The source nlp object to copy from.
        name (str): Optional alternative name to use in current pipeline.
        RETURNS (Tuple[Callable, str]): The component and its factory name.
        """
        # Check source type
        if not isinstance(source, Language):
            raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
        # Check vectors, with faster checks first
        if (
            self.vocab.vectors.shape != source.vocab.vectors.shape
            or self.vocab.vectors.key2row != source.vocab.vectors.key2row
            or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
        ):
            warnings.warn(Warnings.W113.format(name=source_name))
        if source_name not in source.component_names:
            raise KeyError(
                Errors.E944.format(
                    name=source_name,
                    model=f"{source.meta['lang']}_{source.meta['name']}",
                    opts=", ".join(source.component_names),
                )
            )
        pipe = source.get_pipe(source_name)
        # Make sure the source config is interpolated so we don't end up with
        # orphaned variables in our final config
        source_config = source.config.interpolate()
        pipe_config = util.copy_config(source_config["components"][source_name])
        self._pipe_configs[name] = pipe_config
        if self.vocab.strings != source.vocab.strings:
            for s in source.vocab.strings:
                self.vocab.strings.add(s)
        return pipe, pipe_config["factory"]

    def add_pipe(
        self,
        factory_name: str,
        name: Optional[str] = None,
        *,
        before: Optional[Union[str, int]] = None,
        after: Optional[Union[str, int]] = None,
        first: Optional[bool] = None,
        last: Optional[bool] = None,
        source: Optional["Language"] = None,
        config: Dict[str, Any] = SimpleFrozenDict(),
        raw_config: Optional[Config] = None,
        validate: bool = True,
    ) -> Callable[[Doc], Doc]:
        """Add a component to the processing pipeline. Valid components are
        callables that take a `Doc` object, modify it and return it. Only one
        of before/after/first/last can be set. Default behaviour is "last".

        factory_name (str): Name of the component factory.
        name (str): Name of pipeline component. Overwrites existing
            component.name attribute if available. If no name is set and
            the component exposes no name attribute, component.__name__ is
            used. An error is raised if a name already exists in the pipeline.
        before (Union[str, int]): Name or index of the component to insert new
            component directly before.
        after (Union[str, int]): Name or index of the component to insert new
            component directly after.
        first (bool): If True, insert component first in the pipeline.
        last (bool): If True, insert component last in the pipeline.
        source (Language): Optional loaded nlp object to copy the pipeline
            component from.
        config (Dict[str, Any]): Config parameters to use for this component.
            Will be merged with default config, if available.
        raw_config (Optional[Config]): Internals: the non-interpolated config.
        validate (bool): Whether to validate the component config against the
            arguments and types expected by the factory.
        RETURNS (Callable[[Doc], Doc]): The pipeline component.

        DOCS: https://spacy.io/api/language#add_pipe
        """
        if not isinstance(factory_name, str):
            bad_val = repr(factory_name)
            err = Errors.E966.format(component=bad_val, name=name)
            raise ValueError(err)
        name = name if name is not None else factory_name
        if name in self.component_names:
            raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
        if source is not None:
            # We're loading the component from a model. After loading the
            # component, we know its real factory name
            pipe_component, factory_name = self.create_pipe_from_source(
                factory_name, source, name=name
            )
        else:
            if not self.has_factory(factory_name):
                err = Errors.E002.format(
                    name=factory_name,
                    opts=", ".join(self.factory_names),
                    method="add_pipe",
                    lang=util.get_object_name(self),
                    lang_code=self.lang,
                )
            pipe_component = self.create_pipe(
                factory_name,
                name=name,
                config=config,
                raw_config=raw_config,
                validate=validate,
            )
        pipe_index = self._get_pipe_index(before, after, first, last)
        self._pipe_meta[name] = self.get_factory_meta(factory_name)
        self._components.insert(pipe_index, (name, pipe_component))
        return pipe_component

    def _get_pipe_index(
        self,
        before: Optional[Union[str, int]] = None,
        after: Optional[Union[str, int]] = None,
        first: Optional[bool] = None,
        last: Optional[bool] = None,
    ) -> int:
        """Determine where to insert a pipeline component based on the before/
        after/first/last values.

        before (str): Name or index of the component to insert directly before.
        after (str): Name or index of component to insert directly after.
        first (bool): If True, insert component first in the pipeline.
        last (bool): If True, insert component last in the pipeline.
        RETURNS (int): The index of the new pipeline component.
        """
        all_args = {"before": before, "after": after, "first": first, "last": last}
        if sum(arg is not None for arg in [before, after, first, last]) >= 2:
            raise ValueError(
                Errors.E006.format(args=all_args, opts=self.component_names)
            )
        if last or not any(value is not None for value in [first, before, after]):
            return len(self._components)
        elif first:
            return 0
        elif isinstance(before, str):
            if before not in self.component_names:
                raise ValueError(
                    Errors.E001.format(name=before, opts=self.component_names)
                )
            return self.component_names.index(before)
        elif isinstance(after, str):
            if after not in self.component_names:
                raise ValueError(
                    Errors.E001.format(name=after, opts=self.component_names)
                )
            return self.component_names.index(after) + 1
        # We're only accepting indices referring to components that exist
        # (can't just do isinstance here because bools are instance of int, too)
        elif type(before) == int:
            if before >= len(self._components) or before < 0:
                err = Errors.E959.format(
                    dir="before", idx=before, opts=self.component_names
                )
                raise ValueError(err)
            return before
        elif type(after) == int:
            if after >= len(self._components) or after < 0:
                err = Errors.E959.format(
                    dir="after", idx=after, opts=self.component_names
                )
                raise ValueError(err)
            return after + 1
        raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names))

    def has_pipe(self, name: str) -> bool:
        """Check if a component name is present in the pipeline. Equivalent to
        `name in nlp.pipe_names`.

        name (str): Name of the component.
        RETURNS (bool): Whether a component of the name exists in the pipeline.

        DOCS: https://spacy.io/api/language#has_pipe
        """
        return name in self.pipe_names

    def replace_pipe(
        self,
        name: str,
        factory_name: str,
        *,
        config: Dict[str, Any] = SimpleFrozenDict(),
        validate: bool = True,
    ) -> Callable[[Doc], Doc]:
        """Replace a component in the pipeline.

        name (str): Name of the component to replace.
        factory_name (str): Factory name of replacement component.
        config (Optional[Dict[str, Any]]): Config parameters to use for this
            component. Will be merged with default config, if available.
        validate (bool): Whether to validate the component config against the
            arguments and types expected by the factory.
        RETURNS (Callable[[Doc], Doc]): The new pipeline component.

        DOCS: https://spacy.io/api/language#replace_pipe
        """
        if name not in self.component_names:
            raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
        if hasattr(factory_name, "__call__"):
            err = Errors.E968.format(component=repr(factory_name), name=name)
            raise ValueError(err)
        # We need to delegate to Language.add_pipe here instead of just writing
        # to Language.pipeline to make sure the configs are handled correctly
        pipe_index = self.component_names.index(name)
        self.remove_pipe(name)
        if not len(self._components) or pipe_index == len(self._components):
            # we have no components to insert before/after, or we're replacing the last component
            return self.add_pipe(
                factory_name, name=name, config=config, validate=validate
            )
        else:
            return self.add_pipe(
                factory_name,
                name=name,
                before=pipe_index,
                config=config,
                validate=validate,
            )

    def rename_pipe(self, old_name: str, new_name: str) -> None:
        """Rename a pipeline component.

        old_name (str): Name of the component to rename.
        new_name (str): New name of the component.

        DOCS: https://spacy.io/api/language#rename_pipe
        """
        if old_name not in self.component_names:
            raise ValueError(
                Errors.E001.format(name=old_name, opts=self.component_names)
            )
        if new_name in self.component_names:
            raise ValueError(
                Errors.E007.format(name=new_name, opts=self.component_names)
            )
        i = self.component_names.index(old_name)
        self._components[i] = (new_name, self._components[i][1])
        self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
        self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
        # Make sure [initialize] config is adjusted
        if old_name in self._config["initialize"]["components"]:
            init_cfg = self._config["initialize"]["components"].pop(old_name)
            self._config["initialize"]["components"][new_name] = init_cfg

    def remove_pipe(self, name: str) -> Tuple[str, Callable[[Doc], Doc]]:
        """Remove a component from the pipeline.

        name (str): Name of the component to remove.
        RETURNS (tuple): A `(name, component)` tuple of the removed component.

        DOCS: https://spacy.io/api/language#remove_pipe
        """
        if name not in self.component_names:
            raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
        removed = self._components.pop(self.component_names.index(name))
        # We're only removing the component itself from the metas/configs here
        # because factory may be used for something else
        self._pipe_meta.pop(name)
        self._pipe_configs.pop(name)
        self.meta.get("_sourced_vectors_hashes", {}).pop(name, None)
        # Make sure name is removed from the [initialize] config
        if name in self._config["initialize"]["components"]:
            self._config["initialize"]["components"].pop(name)
        # Make sure the name is also removed from the set of disabled components
        if name in self.disabled:
            self._disabled.remove(name)
        return removed

    def disable_pipe(self, name: str) -> None:
        """Disable a pipeline component. The component will still exist on
        the nlp object, but it won't be run as part of the pipeline. Does
        nothing if the component is already disabled.

        name (str): The name of the component to disable.
        """
        if name not in self.component_names:
            raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
        self._disabled.add(name)

    def enable_pipe(self, name: str) -> None:
        """Enable a previously disabled pipeline component so it's run as part
        of the pipeline. Does nothing if the component is already enabled.

        name (str): The name of the component to enable.
        """
        if name not in self.component_names:
            raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
        if name in self.disabled:
            self._disabled.remove(name)

    def __call__(
        self,
        text: str,
        *,
        disable: Iterable[str] = SimpleFrozenList(),
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> Doc:
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        text (str): The text to be processed.
        disable (list): Names of the pipeline components to disable.
        component_cfg (Dict[str, dict]): An optional dictionary with extra
            keyword arguments for specific components.
        RETURNS (Doc): A container for accessing the annotations.

        DOCS: https://spacy.io/api/language#call
        """
        doc = self.make_doc(text)
        if component_cfg is None:
            component_cfg = {}
        for name, proc in self.pipeline:
            if name in disable:
                continue
            if not hasattr(proc, "__call__"):
                raise ValueError(Errors.E003.format(component=type(proc), name=name))
            error_handler = self.default_error_handler
            if hasattr(proc, "get_error_handler"):
                error_handler = proc.get_error_handler()
            try:
                doc = proc(doc, **component_cfg.get(name, {}))
            except KeyError as e:
                # This typically happens if a component is not initialized
                raise ValueError(Errors.E109.format(name=name)) from e
            except Exception as e:
                error_handler(name, proc, [doc], e)
            if doc is None:
                raise ValueError(Errors.E005.format(name=name))
        return doc

    def disable_pipes(self, *names) -> "DisabledPipes":
        """Disable one or more pipeline components. If used as a context
        manager, the pipeline will be restored to the initial state at the end
        of the block. Otherwise, a DisabledPipes object is returned, that has
        a `.restore()` method you can use to undo your changes.

        This method has been deprecated since 3.0
        """
        warnings.warn(Warnings.W096, DeprecationWarning)
        if len(names) == 1 and isinstance(names[0], (list, tuple)):
            names = names[0]  # support list of names instead of spread
        return self.select_pipes(disable=names)

    def select_pipes(
        self,
        *,
        disable: Optional[Union[str, Iterable[str]]] = None,
        enable: Optional[Union[str, Iterable[str]]] = None,
    ) -> "DisabledPipes":
        """Disable one or more pipeline components. If used as a context
        manager, the pipeline will be restored to the initial state at the end
        of the block. Otherwise, a DisabledPipes object is returned, that has
        a `.restore()` method you can use to undo your changes.

        disable (str or iterable): The name(s) of the pipes to disable
        enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled

        DOCS: https://spacy.io/api/language#select_pipes
        """
        if enable is None and disable is None:
            raise ValueError(Errors.E991)
        if disable is not None and isinstance(disable, str):
            disable = [disable]
        if enable is not None:
            if isinstance(enable, str):
                enable = [enable]
            to_disable = [pipe for pipe in self.pipe_names if pipe not in enable]
            # raise an error if the enable and disable keywords are not consistent
            if disable is not None and disable != to_disable:
                raise ValueError(
                    Errors.E992.format(
                        enable=enable, disable=disable, names=self.pipe_names
                    )
                )
            disable = to_disable
        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
        # those pipes that were already disabled.
        disable = [d for d in disable if d not in self._disabled]
        return DisabledPipes(self, disable)

    def make_doc(self, text: str) -> Doc:
        """Turn a text into a Doc object.

        text (str): The text to process.
        RETURNS (Doc): The processed doc.
        """
        if len(text) > self.max_length:
            raise ValueError(
                Errors.E088.format(length=len(text), max_length=self.max_length)
            )
        return self.tokenizer(text)

    def update(
        self,
        examples: Iterable[Example],
        _: Optional[Any] = None,
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
        exclude: Iterable[str] = SimpleFrozenList(),
        annotates: Iterable[str] = SimpleFrozenList(),
    ):
        """Update the models in the pipeline.

        examples (Iterable[Example]): A batch of examples
        _: Should not be set - serves to catch backwards-incompatible scripts.
        drop (float): The dropout rate.
        sgd (Optimizer): An optimizer.
        losses (Dict[str, float]): Dictionary to update with the loss, keyed by
            component.
        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
            components, keyed by component name.
        exclude (Iterable[str]): Names of components that shouldn't be updated.
        annotates (Iterable[str]): Names of components that should set
            annotations on the predicted examples after updating.
        RETURNS (Dict[str, float]): The updated losses dictionary

        DOCS: https://spacy.io/api/language#update
        """
        if _ is not None:
            raise ValueError(Errors.E989)
        if losses is None:
            losses = {}
        if len(examples) == 0:
            return losses
        validate_examples(examples, "Language.update")
        examples = _copy_examples(examples)
        if sgd is None:
            if self._optimizer is None:
                self._optimizer = self.create_optimizer()
            sgd = self._optimizer
        if component_cfg is None:
            component_cfg = {}
        pipe_kwargs = {}
        for i, (name, proc) in enumerate(self.pipeline):
            component_cfg.setdefault(name, {})
            pipe_kwargs[name] = deepcopy(component_cfg[name])
            component_cfg[name].setdefault("drop", drop)
            pipe_kwargs[name].setdefault("batch_size", self.batch_size)
        for name, proc in self.pipeline:
            if name not in exclude and hasattr(proc, "update"):
                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
            if sgd not in (None, False):
                if (
                    name not in exclude
                    and hasattr(proc, "is_trainable")
                    and proc.is_trainable
                    and proc.model not in (True, False, None)
                ):
                    proc.finish_update(sgd)
            if name in annotates:
                for doc, eg in zip(
                    _pipe(
                        (eg.predicted for eg in examples),
                        proc=proc,
                        name=name,
                        default_error_handler=self.default_error_handler,
                        kwargs=pipe_kwargs[name],
                    ),
                    examples,
                ):
                    eg.predicted = doc
        return losses

    def rehearse(
        self,
        examples: Iterable[Example],
        *,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
        exclude: Iterable[str] = SimpleFrozenList(),
    ) -> Dict[str, float]:
        """Make a "rehearsal" update to the models in the pipeline, to prevent
        forgetting. Rehearsal updates run an initial copy of the model over some
        data, and update the model so its current predictions are more like the
        initial ones. This is useful for keeping a pretrained model on-track,
        even if you're updating it with a smaller set of examples.

        examples (Iterable[Example]): A batch of `Example` objects.
        sgd (Optional[Optimizer]): An optimizer.
        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
            components, keyed by component name.
        exclude (Iterable[str]): Names of components that shouldn't be updated.
        RETURNS (dict): Results from the update.

        EXAMPLE:
            >>> raw_text_batches = minibatch(raw_texts)
            >>> for labelled_batch in minibatch(examples):
            >>>     nlp.update(labelled_batch)
            >>>     raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
            >>>     nlp.rehearse(raw_batch)

        DOCS: https://spacy.io/api/language#rehearse
        """
        if len(examples) == 0:
            return
        validate_examples(examples, "Language.rehearse")
        if sgd is None:
            if self._optimizer is None:
                self._optimizer = self.create_optimizer()
            sgd = self._optimizer
        pipes = list(self.pipeline)
        random.shuffle(pipes)
        if component_cfg is None:
            component_cfg = {}
        grads = {}

        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)

        get_grads.learn_rate = sgd.learn_rate
        get_grads.b1 = sgd.b1
        get_grads.b2 = sgd.b2
        for name, proc in pipes:
            if name in exclude or not hasattr(proc, "rehearse"):
                continue
            grads = {}
            proc.rehearse(
                examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
            )
        for key, (W, dW) in grads.items():
            sgd(W, dW, key=key)
        return losses

    def begin_training(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        sgd: Optional[Optimizer] = None,
    ) -> Optimizer:
        warnings.warn(Warnings.W089, DeprecationWarning)
        return self.initialize(get_examples, sgd=sgd)

    def initialize(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        sgd: Optional[Optimizer] = None,
    ) -> Optimizer:
        """Initialize the pipe for training, using data examples if available.

        get_examples (Callable[[], Iterable[Example]]): Optional function that
            returns gold-standard Example objects.
        sgd (Optional[Optimizer]): An optimizer to use for updates. If not
            provided, will be created using the .create_optimizer() method.
        RETURNS (thinc.api.Optimizer): The optimizer.

        DOCS: https://spacy.io/api/language#initialize
        """
        if get_examples is None:
            util.logger.debug(
                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
            )
            doc = Doc(self.vocab, words=["x", "y", "z"])
            get_examples = lambda: [Example.from_dict(doc, {})]
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(
                method="Language.initialize", obj=type(get_examples)
            )
            raise TypeError(err)
        # Make sure the config is interpolated so we can resolve subsections
        config = self.config.interpolate()
        # These are the settings provided in the [initialize] block in the config
        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
        before_init = I["before_init"]
        if before_init is not None:
            before_init(self)
        try:
            init_vocab(
                self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
            )
        except IOError:
            raise IOError(Errors.E884.format(vectors=I["vectors"]))
        if self.vocab.vectors.data.shape[1] >= 1:
            ops = get_current_ops()
            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
        if hasattr(self.tokenizer, "initialize"):
            tok_settings = validate_init_settings(
                self.tokenizer.initialize,
                I["tokenizer"],
                section="tokenizer",
                name="tokenizer",
            )
            self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
        for name, proc in self.pipeline:
            if hasattr(proc, "initialize"):
                p_settings = I["components"].get(name, {})
                p_settings = validate_init_settings(
                    proc.initialize, p_settings, section="components", name=name
                )
                proc.initialize(get_examples, nlp=self, **p_settings)
        pretrain_cfg = config.get("pretraining")
        if pretrain_cfg:
            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
            init_tok2vec(self, P, I)
        self._link_components()
        self._optimizer = sgd
        if sgd is not None:
            self._optimizer = sgd
        elif self._optimizer is None:
            self._optimizer = self.create_optimizer()
        after_init = I["after_init"]
        if after_init is not None:
            after_init(self)
        return self._optimizer

    def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
        """Continue training a pretrained model.

        Create and return an optimizer, and initialize "rehearsal" for any pipeline
        component that has a .rehearse() method. Rehearsal is used to prevent
        models from "forgetting" their initialized "knowledge". To perform
        rehearsal, collect samples of text you want the models to retain performance
        on, and call nlp.rehearse() with a batch of Example objects.

        RETURNS (Optimizer): The optimizer.

        DOCS: https://spacy.io/api/language#resume_training
        """
        ops = get_current_ops()
        if self.vocab.vectors.data.shape[1] >= 1:
            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
        for name, proc in self.pipeline:
            if hasattr(proc, "_rehearsal_model"):
                proc._rehearsal_model = deepcopy(proc.model)
        if sgd is not None:
            self._optimizer = sgd
        elif self._optimizer is None:
            self._optimizer = self.create_optimizer()
        return self._optimizer

    def set_error_handler(
        self,
        error_handler: Callable[
            [str, Callable[[Doc], Doc], List[Doc], Exception], None
        ],
    ):
        """Set an error handler object for all the components in the pipeline that implement
        a set_error_handler function.

        error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]):
            Function that deals with a failing batch of documents. This callable function should take in
            the component's name, the component itself, the offending batch of documents, and the exception
            that was thrown.
        DOCS: https://spacy.io/api/language#set_error_handler
        """
        self.default_error_handler = error_handler
        for name, pipe in self.pipeline:
            if hasattr(pipe, "set_error_handler"):
                pipe.set_error_handler(error_handler)

    def evaluate(
        self,
        examples: Iterable[Example],
        *,
        batch_size: Optional[int] = None,
        scorer: Optional[Scorer] = None,
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
        scorer_cfg: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Union[float, dict]]:
        """Evaluate a model's pipeline components.

        examples (Iterable[Example]): `Example` objects.
        batch_size (Optional[int]): Batch size to use.
        scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
            will be created.
        component_cfg (dict): An optional dictionary with extra keyword
            arguments for specific components.
        scorer_cfg (dict): An optional dictionary with extra keyword arguments
            for the scorer.

        RETURNS (Scorer): The scorer containing the evaluation results.

        DOCS: https://spacy.io/api/language#evaluate
        """
        examples = list(examples)
        validate_examples(examples, "Language.evaluate")
        examples = _copy_examples(examples)
        if batch_size is None:
            batch_size = self.batch_size
        if component_cfg is None:
            component_cfg = {}
        if scorer_cfg is None:
            scorer_cfg = {}
        if scorer is None:
            kwargs = dict(scorer_cfg)
            kwargs.setdefault("nlp", self)
            scorer = Scorer(**kwargs)
        # reset annotation in predicted docs and time tokenization
        start_time = timer()
        # this is purely for timing
        for eg in examples:
            self.make_doc(eg.reference.text)
        # apply all pipeline components
        for name, pipe in self.pipeline:
            kwargs = component_cfg.get(name, {})
            kwargs.setdefault("batch_size", batch_size)
            for doc, eg in zip(
                _pipe(
                    (eg.predicted for eg in examples),
                    proc=pipe,
                    name=name,
                    default_error_handler=self.default_error_handler,
                    kwargs=kwargs,
                ),
                examples,
            ):
                eg.predicted = doc
        end_time = timer()
        results = scorer.score(examples)
        n_words = sum(len(eg.predicted) for eg in examples)
        results["speed"] = n_words / (end_time - start_time)
        return results

    def create_optimizer(self):
        """Create an optimizer, usually using the [training.optimizer] config."""
        subconfig = {"optimizer": self.config["training"]["optimizer"]}
        return registry.resolve(subconfig)["optimizer"]

    @contextmanager
    def use_params(self, params: Optional[dict]):
        """Replace weights of models in the pipeline with those provided in the
        params dictionary. Can be used as a contextmanager, in which case,
        models go back to their original weights after the block.

        params (dict): A dictionary of parameters keyed by model ID.

        EXAMPLE:
            >>> with nlp.use_params(optimizer.averages):
            >>>     nlp.to_disk("/tmp/checkpoint")

        DOCS: https://spacy.io/api/language#use_params
        """
        if not params:
            yield
        else:
            contexts = [
                pipe.use_params(params)
                for name, pipe in self.pipeline
                if hasattr(pipe, "use_params") and hasattr(pipe, "model")
            ]
            # TODO: Having trouble with contextlib
            # Workaround: these aren't actually context managers atm.
            for context in contexts:
                try:
                    next(context)
                except StopIteration:
                    pass
            yield
            for context in contexts:
                try:
                    next(context)
                except StopIteration:
                    pass

    _AnyContext = TypeVar("_AnyContext")

    @overload
    def pipe(
        self,
        texts: Iterable[Tuple[str, _AnyContext]],
        *,
        as_tuples: bool = ...,
        batch_size: Optional[int] = ...,
        disable: Iterable[str] = ...,
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
        n_process: int = ...,
    ) -> Iterator[Tuple[Doc, _AnyContext]]:
        ...

    def pipe(  # noqa: F811
        self,
        texts: Iterable[str],
        *,
        as_tuples: bool = False,
        batch_size: Optional[int] = None,
        disable: Iterable[str] = SimpleFrozenList(),
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
        n_process: int = 1,
    ) -> Iterator[Doc]:
        """Process texts as a stream, and yield `Doc` objects in order.

        texts (Iterable[str]): A sequence of texts to process.
        as_tuples (bool): If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        batch_size (Optional[int]): The number of texts to buffer.
        disable (List[str]): Names of the pipeline components to disable.
        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
            arguments for specific components.
        n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
        YIELDS (Doc): Documents in the order of the original text.

        DOCS: https://spacy.io/api/language#pipe
        """
        if n_process == -1:
            n_process = mp.cpu_count()
        if as_tuples:
            text_context1, text_context2 = itertools.tee(texts)
            texts = (tc[0] for tc in text_context1)
            contexts = (tc[1] for tc in text_context2)
            docs = self.pipe(
                texts,
                batch_size=batch_size,
                disable=disable,
                n_process=n_process,
                component_cfg=component_cfg,
            )
            for doc, context in zip(docs, contexts):
                yield (doc, context)
            return
        if component_cfg is None:
            component_cfg = {}
        if batch_size is None:
            batch_size = self.batch_size

        pipes = (
            []
        )  # contains functools.partial objects to easily create multiprocess worker.
        for name, proc in self.pipeline:
            if name in disable:
                continue
            kwargs = component_cfg.get(name, {})
            # Allow component_cfg to overwrite the top-level kwargs.
            kwargs.setdefault("batch_size", batch_size)
            f = functools.partial(
                _pipe,
                proc=proc,
                name=name,
                kwargs=kwargs,
                default_error_handler=self.default_error_handler,
            )
            pipes.append(f)

        if n_process != 1:
            docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
        else:
            # if n_process == 1, no processes are forked.
            docs = (self.make_doc(text) for text in texts)
            for pipe in pipes:
                docs = pipe(docs)
        for doc in docs:
            yield doc

    def _multiprocessing_pipe(
        self,
        texts: Iterable[str],
        pipes: Iterable[Callable[[Doc], Doc]],
        n_process: int,
        batch_size: int,
    ) -> None:
        # raw_texts is used later to stop iteration.
        texts, raw_texts = itertools.tee(texts)
        # for sending texts to worker
        texts_q = [mp.Queue() for _ in range(n_process)]
        # for receiving byte-encoded docs from worker
        bytedocs_recv_ch, bytedocs_send_ch = zip(
            *[mp.Pipe(False) for _ in range(n_process)]
        )

        batch_texts = util.minibatch(texts, batch_size)
        # Sender sends texts to the workers.
        # This is necessary to properly handle infinite length of texts.
        # (In this case, all data cannot be sent to the workers at once)
        sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
        # send twice to make process busy
        sender.send()
        sender.send()

        procs = [
            mp.Process(
                target=_apply_pipes,
                args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
            )
            for rch, sch in zip(texts_q, bytedocs_send_ch)
        ]
        for proc in procs:
            proc.start()

        # Cycle channels not to break the order of docs.
        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
        byte_tuples = chain.from_iterable(
            recv.recv() for recv in cycle(bytedocs_recv_ch)
        )
        try:
            for i, (_, (byte_doc, byte_error)) in enumerate(
                zip(raw_texts, byte_tuples), 1
            ):
                if byte_doc is not None:
                    doc = Doc(self.vocab).from_bytes(byte_doc)
                    yield doc
                elif byte_error is not None:
                    error = srsly.msgpack_loads(byte_error)
                    self.default_error_handler(
                        None, None, None, ValueError(Errors.E871.format(error=error))
                    )
                if i % batch_size == 0:
                    # tell `sender` that one batch was consumed.
                    sender.step()
        finally:
            for proc in procs:
                proc.terminate()

    def _link_components(self) -> None:
        """Register 'listeners' within pipeline components, to allow them to
        effectively share weights.
        """
        # I had thought, "Why do we do this inside the Language object? Shouldn't
        # it be the tok2vec/transformer/etc's job?
        # The problem is we need to do it during deserialization...And the
        # components don't receive the pipeline then. So this does have to be
        # here :(
        for i, (name1, proc1) in enumerate(self.pipeline):
            if hasattr(proc1, "find_listeners"):
                for name2, proc2 in self.pipeline[i + 1 :]:
                    proc1.find_listeners(proc2)

    @classmethod
    def from_config(
        cls,
        config: Union[Dict[str, Any], Config] = {},
        *,
        vocab: Union[Vocab, bool] = True,
        disable: Iterable[str] = SimpleFrozenList(),
        exclude: Iterable[str] = SimpleFrozenList(),
        meta: Dict[str, Any] = SimpleFrozenDict(),
        auto_fill: bool = True,
        validate: bool = True,
    ) -> "Language":
        """Create the nlp object from a loaded config. Will set up the tokenizer
        and language data, add pipeline components etc. If no config is provided,
        the default config of the given language is used.

        config (Dict[str, Any] / Config): The loaded config.
        vocab (Vocab): A Vocab object. If True, a vocab is created.
        disable (Iterable[str]): Names of pipeline components to disable.
            Disabled pipes will be loaded but they won't be run unless you
            explicitly enable them by calling nlp.enable_pipe.
        exclude (Iterable[str]): Names of pipeline components to exclude.
            Excluded components won't be loaded.
        meta (Dict[str, Any]): Meta overrides for nlp.meta.
        auto_fill (bool): Automatically fill in missing values in config based
            on defaults and function argument annotations.
        validate (bool): Validate the component config and arguments against
            the types expected by the factory.
        RETURNS (Language): The initialized Language class.

        DOCS: https://spacy.io/api/language#from_config
        """
        if auto_fill:
            config = Config(
                cls.default_config, section_order=CONFIG_SECTION_ORDER
            ).merge(config)
        if "nlp" not in config:
            raise ValueError(Errors.E985.format(config=config))
        config_lang = config["nlp"].get("lang")
        if config_lang is not None and config_lang != cls.lang:
            raise ValueError(
                Errors.E958.format(
                    bad_lang_code=config["nlp"]["lang"],
                    lang_code=cls.lang,
                    lang=util.get_object_name(cls),
                )
            )
        config["nlp"]["lang"] = cls.lang
        # This isn't very elegant, but we remove the [components] block here to prevent
        # it from getting resolved (causes problems because we expect to pass in
        # the nlp and name args for each component). If we're auto-filling, we're
        # using the nlp.config with all defaults.
        config = util.copy_config(config)
        orig_pipeline = config.pop("components", {})
        orig_pretraining = config.pop("pretraining", None)
        config["components"] = {}
        if auto_fill:
            filled = registry.fill(config, validate=validate, schema=ConfigSchema)
        else:
            filled = config
        filled["components"] = orig_pipeline
        config["components"] = orig_pipeline
        if orig_pretraining is not None:
            filled["pretraining"] = orig_pretraining
            config["pretraining"] = orig_pretraining
        resolved_nlp = registry.resolve(
            filled["nlp"], validate=validate, schema=ConfigSchemaNlp
        )
        create_tokenizer = resolved_nlp["tokenizer"]
        before_creation = resolved_nlp["before_creation"]
        after_creation = resolved_nlp["after_creation"]
        after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
        lang_cls = cls
        if before_creation is not None:
            lang_cls = before_creation(cls)
            if (
                not isinstance(lang_cls, type)
                or not issubclass(lang_cls, cls)
                or lang_cls is not cls
            ):
                raise ValueError(Errors.E943.format(value=type(lang_cls)))

        # Warn about require_gpu usage in jupyter notebook
        warn_if_jupyter_cupy()

        # Note that we don't load vectors here, instead they get loaded explicitly
        # inside stuff like the spacy train function. If we loaded them here,
        # then we would load them twice at runtime: once when we make from config,
        # and then again when we load from disk.
        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
        if after_creation is not None:
            nlp = after_creation(nlp)
            if not isinstance(nlp, cls):
                raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
        # To create the components we need to use the final interpolated config
        # so all values are available (if component configs use variables).
        # Later we replace the component config with the raw config again.
        interpolated = filled.interpolate() if not filled.is_interpolated else filled
        pipeline = interpolated.get("components", {})
        sourced = util.get_sourced_components(interpolated)
        # If components are loaded from a source (existing models), we cache
        # them here so they're only loaded once
        source_nlps = {}
        source_nlp_vectors_hashes = {}
        vocab_b = None
        for pipe_name in config["nlp"]["pipeline"]:
            if pipe_name not in pipeline:
                opts = ", ".join(pipeline.keys())
                raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
            pipe_cfg = util.copy_config(pipeline[pipe_name])
            raw_config = Config(filled["components"][pipe_name])
            if pipe_name not in exclude:
                if "factory" not in pipe_cfg and "source" not in pipe_cfg:
                    err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
                    raise ValueError(err)
                if "factory" in pipe_cfg:
                    factory = pipe_cfg.pop("factory")
                    # The pipe name (key in the config) here is the unique name
                    # of the component, not necessarily the factory
                    nlp.add_pipe(
                        factory,
                        name=pipe_name,
                        config=pipe_cfg,
                        validate=validate,
                        raw_config=raw_config,
                    )
                else:
                    # We need the sourced components to reference the same
                    # vocab without modifying the current vocab state **AND**
                    # we still want to load the source model vectors to perform
                    # the vectors check. Since the source vectors clobber the
                    # current ones, we save the original vocab state and
                    # restore after this loop. Existing strings are preserved
                    # during deserialization, so they do not need any
                    # additional handling.
                    if vocab_b is None:
                        vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
                    model = pipe_cfg["source"]
                    if model not in source_nlps:
                        # Load with the same vocab, adding any strings
                        source_nlps[model] = util.load_model(
                            model, vocab=nlp.vocab, exclude=["lookups"]
                        )
                    source_name = pipe_cfg.get("component", pipe_name)
                    listeners_replaced = False
                    if "replace_listeners" in pipe_cfg:
                        for name, proc in source_nlps[model].pipeline:
                            if source_name in getattr(proc, "listening_components", []):
                                source_nlps[model].replace_listeners(
                                    name, source_name, pipe_cfg["replace_listeners"]
                                )
                                listeners_replaced = True
                    with warnings.catch_warnings():
                        warnings.filterwarnings("ignore", message="\\[W113\\]")
                        nlp.add_pipe(
                            source_name, source=source_nlps[model], name=pipe_name
                        )
                    if model not in source_nlp_vectors_hashes:
                        source_nlp_vectors_hashes[model] = hash(
                            source_nlps[model].vocab.vectors.to_bytes()
                        )
                    if "_sourced_vectors_hashes" not in nlp.meta:
                        nlp.meta["_sourced_vectors_hashes"] = {}
                    nlp.meta["_sourced_vectors_hashes"][
                        pipe_name
                    ] = source_nlp_vectors_hashes[model]
                    # Delete from cache if listeners were replaced
                    if listeners_replaced:
                        del source_nlps[model]
        # Restore the original vocab after sourcing if necessary
        if vocab_b is not None:
            nlp.vocab.from_bytes(vocab_b)
        disabled_pipes = [*config["nlp"]["disabled"], *disable]
        nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
        nlp.batch_size = config["nlp"]["batch_size"]
        nlp.config = filled if auto_fill else config
        if after_pipeline_creation is not None:
            nlp = after_pipeline_creation(nlp)
            if not isinstance(nlp, cls):
                raise ValueError(
                    Errors.E942.format(name="pipeline_creation", value=type(nlp))
                )
        # Detect components with listeners that are not frozen consistently
        for name, proc in nlp.pipeline:
            # Remove listeners not in the pipeline
            listener_names = getattr(proc, "listening_components", [])
            unused_listener_names = [
                ll for ll in listener_names if ll not in nlp.pipe_names
            ]
            for listener_name in unused_listener_names:
                for listener in proc.listener_map.get(listener_name, []):
                    proc.remove_listener(listener, listener_name)

            for listener in getattr(
                proc, "listening_components", []
            ):  # e.g. tok2vec/transformer
                # If it's a component sourced from another pipeline, we check if
                # the tok2vec listeners should be replaced with standalone tok2vec
                # models (e.g. so component can be frozen without its performance
                # degrading when other components/tok2vec are updated)
                paths = sourced.get(listener, {}).get("replace_listeners", [])
                if paths:
                    nlp.replace_listeners(name, listener, paths)
        return nlp

    def replace_listeners(
        self,
        tok2vec_name: str,
        pipe_name: str,
        listeners: Iterable[str],
    ) -> None:
        """Find listener layers (connecting to a token-to-vector embedding
        component) of a given pipeline component model and replace
        them with a standalone copy of the token-to-vector layer. This can be
        useful when training a pipeline with components sourced from an existing
        pipeline: if multiple components (e.g. tagger, parser, NER) listen to
        the same tok2vec component, but some of them are frozen and not updated,
        their performance may degrade significally as the tok2vec component is
        updated with new data. To prevent this, listeners can be replaced with
        a standalone tok2vec layer that is owned by the component and doesn't
        change if the component isn't updated.

        tok2vec_name (str): Name of the token-to-vector component, typically
            "tok2vec" or "transformer".
        pipe_name (str): Name of pipeline component to replace listeners for.
        listeners (Iterable[str]): The paths to the listeners, relative to the
            component config, e.g. ["model.tok2vec"]. Typically, implementations
            will only connect to one tok2vec component, [model.tok2vec], but in
            theory, custom models can use multiple listeners. The value here can
            either be an empty list to not replace any listeners, or a complete
            (!) list of the paths to all listener layers used by the model.

        DOCS: https://spacy.io/api/language#replace_listeners
        """
        if tok2vec_name not in self.pipe_names:
            err = Errors.E889.format(
                tok2vec=tok2vec_name,
                name=pipe_name,
                unknown=tok2vec_name,
                opts=", ".join(self.pipe_names),
            )
            raise ValueError(err)
        if pipe_name not in self.pipe_names:
            err = Errors.E889.format(
                tok2vec=tok2vec_name,
                name=pipe_name,
                unknown=pipe_name,
                opts=", ".join(self.pipe_names),
            )
            raise ValueError(err)
        tok2vec = self.get_pipe(tok2vec_name)
        tok2vec_cfg = self.get_pipe_config(tok2vec_name)
        tok2vec_model = tok2vec.model
        if (
            not hasattr(tok2vec, "model")
            or not hasattr(tok2vec, "listener_map")
            or not hasattr(tok2vec, "remove_listener")
            or "model" not in tok2vec_cfg
        ):
            raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec)))
        pipe_listeners = tok2vec.listener_map.get(pipe_name, [])
        pipe = self.get_pipe(pipe_name)
        pipe_cfg = self._pipe_configs[pipe_name]
        if listeners:
            util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
            if len(listeners) != len(pipe_listeners):
                # The number of listeners defined in the component model doesn't
                # match the listeners to replace, so we won't be able to update
                # the nodes and generate a matching config
                err = Errors.E887.format(
                    name=pipe_name,
                    tok2vec=tok2vec_name,
                    paths=listeners,
                    n_listeners=len(pipe_listeners),
                )
                raise ValueError(err)
            # Update the config accordingly by copying the tok2vec model to all
            # sections defined in the listener paths
            for listener_path in listeners:
                # Check if the path actually exists in the config
                try:
                    util.dot_to_object(pipe_cfg, listener_path)
                except KeyError:
                    err = Errors.E886.format(
                        name=pipe_name, tok2vec=tok2vec_name, path=listener_path
                    )
                    raise ValueError(err)
                new_config = tok2vec_cfg["model"]
                if "replace_listener_cfg" in tok2vec_model.attrs:
                    replace_func = tok2vec_model.attrs["replace_listener_cfg"]
                    new_config = replace_func(
                        tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
                    )
                util.set_dot_to_object(pipe_cfg, listener_path, new_config)
            # Go over the listener layers and replace them
            for listener in pipe_listeners:
                new_model = tok2vec_model.copy()
                if "replace_listener" in tok2vec_model.attrs:
                    new_model = tok2vec_model.attrs["replace_listener"](new_model)
                util.replace_model_node(pipe.model, listener, new_model)
                tok2vec.remove_listener(listener, pipe_name)

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Save the current state to a directory.  If a model is loaded, this
        will include the model.

        path (str / Path): Path to a directory, which will be created if
            it doesn't exist.
        exclude (list): Names of components or serialization fields to exclude.

        DOCS: https://spacy.io/api/language#to_disk
        """
        path = util.ensure_path(path)
        serializers = {}
        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
            p, exclude=["vocab"]
        )
        serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
        serializers["config.cfg"] = lambda p: self.config.to_disk(p)
        for name, proc in self._components:
            if name in exclude:
                continue
            if not hasattr(proc, "to_disk"):
                continue
            serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])
        serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
        util.to_disk(path, serializers, exclude)

    def from_disk(
        self,
        path: Union[str, Path],
        *,
        exclude: Iterable[str] = SimpleFrozenList(),
        overrides: Dict[str, Any] = SimpleFrozenDict(),
    ) -> "Language":
        """Loads state from a directory. Modifies the object in place and
        returns it. If the saved `Language` object contains a model, the
        model will be loaded.

        path (str / Path): A path to a directory.
        exclude (list): Names of components or serialization fields to exclude.
        RETURNS (Language): The modified `Language` object.

        DOCS: https://spacy.io/api/language#from_disk
        """

        def deserialize_meta(path: Path) -> None:
            if path.exists():
                data = srsly.read_json(path)
                self.meta.update(data)
                # self.meta always overrides meta["vectors"] with the metadata
                # from self.vocab.vectors, so set the name directly
                self.vocab.vectors.name = data.get("vectors", {}).get("name")

        def deserialize_vocab(path: Path) -> None:
            if path.exists():
                self.vocab.from_disk(path, exclude=exclude)

        path = util.ensure_path(path)
        deserializers = {}
        if Path(path / "config.cfg").exists():
            deserializers["config.cfg"] = lambda p: self.config.from_disk(
                p, interpolate=False, overrides=overrides
            )
        deserializers["meta.json"] = deserialize_meta
        deserializers["vocab"] = deserialize_vocab
        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
            p, exclude=["vocab"]
        )
        for name, proc in self._components:
            if name in exclude:
                continue
            if not hasattr(proc, "from_disk"):
                continue
            deserializers[name] = lambda p, proc=proc: proc.from_disk(
                p, exclude=["vocab"]
            )
        if not (path / "vocab").exists() and "vocab" not in exclude:
            # Convert to list here in case exclude is (default) tuple
            exclude = list(exclude) + ["vocab"]
        util.from_disk(path, deserializers, exclude)
        self._path = path
        self._link_components()
        return self

    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
        """Serialize the current state to a binary string.

        exclude (list): Names of components or serialization fields to exclude.
        RETURNS (bytes): The serialized form of the `Language` object.

        DOCS: https://spacy.io/api/language#to_bytes
        """
        serializers = {}
        serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
        serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
        serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
        serializers["config.cfg"] = lambda: self.config.to_bytes()
        for name, proc in self._components:
            if name in exclude:
                continue
            if not hasattr(proc, "to_bytes"):
                continue
            serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
        return util.to_bytes(serializers, exclude)

    def from_bytes(
        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "Language":
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        exclude (list): Names of components or serialization fields to exclude.
        RETURNS (Language): The `Language` object.

        DOCS: https://spacy.io/api/language#from_bytes
        """

        def deserialize_meta(b):
            data = srsly.json_loads(b)
            self.meta.update(data)
            # self.meta always overrides meta["vectors"] with the metadata
            # from self.vocab.vectors, so set the name directly
            self.vocab.vectors.name = data.get("vectors", {}).get("name")

        deserializers = {}
        deserializers["config.cfg"] = lambda b: self.config.from_bytes(
            b, interpolate=False
        )
        deserializers["meta.json"] = deserialize_meta
        deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
            b, exclude=["vocab"]
        )
        for name, proc in self._components:
            if name in exclude:
                continue
            if not hasattr(proc, "from_bytes"):
                continue
            deserializers[name] = lambda b, proc=proc: proc.from_bytes(
                b, exclude=["vocab"]
            )
        util.from_bytes(bytes_data, deserializers, exclude)
        self._link_components()
        return self


@dataclass
class FactoryMeta:
    """Dataclass containing information about a component and its defaults
    provided by the @Language.component or @Language.factory decorator. It's
    created whenever a component is defined and stored on the Language class for
    each component instance and factory instance.
    """

    factory: str
    default_config: Optional[Dict[str, Any]] = None  # noqa: E704
    assigns: Iterable[str] = tuple()
    requires: Iterable[str] = tuple()
    retokenizes: bool = False
    scores: Iterable[str] = tuple()
    default_score_weights: Optional[Dict[str, float]] = None  # noqa: E704


class DisabledPipes(list):
    """Manager for temporary pipeline disabling."""

    def __init__(self, nlp: Language, names: List[str]) -> None:
        self.nlp = nlp
        self.names = names
        for name in self.names:
            self.nlp.disable_pipe(name)
        list.__init__(self)
        self.extend(self.names)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.restore()

    def restore(self) -> None:
        """Restore the pipeline to its state when DisabledPipes was created."""
        for name in self.names:
            if name not in self.nlp.component_names:
                raise ValueError(Errors.E008.format(name=name))
            self.nlp.enable_pipe(name)
        self[:] = []


def _copy_examples(examples: Iterable[Example]) -> List[Example]:
    """Make a copy of a batch of examples, copying the predicted Doc as well.
    This is used in contexts where we need to take ownership of the examples
    so that they can be mutated, for instance during Language.evaluate and
    Language.update.
    """
    return [Example(eg.x.copy(), eg.y) for eg in examples]


def _apply_pipes(
    make_doc: Callable[[str], Doc],
    pipes: Iterable[Callable[[Doc], Doc]],
    receiver,
    sender,
    underscore_state: Tuple[dict, dict, dict],
) -> None:
    """Worker for Language.pipe

    make_doc (Callable[[str,] Doc]): Function to create Doc from text.
    pipes (Iterable[Callable[[Doc], Doc]]): The components to apply.
    receiver (multiprocessing.Connection): Pipe to receive text. Usually
        created by `multiprocessing.Pipe()`
    sender (multiprocessing.Connection): Pipe to send doc. Usually created by
        `multiprocessing.Pipe()`
    underscore_state (Tuple[dict, dict, dict]): The data in the Underscore class
        of the parent.
    """
    Underscore.load_state(underscore_state)
    while True:
        try:
            texts = receiver.get()
            docs = (make_doc(text) for text in texts)
            for pipe in pipes:
                docs = pipe(docs)
            # Connection does not accept unpickable objects, so send list.
            byte_docs = [(doc.to_bytes(), None) for doc in docs]
            padding = [(None, None)] * (len(texts) - len(byte_docs))
            sender.send(byte_docs + padding)
        except Exception:
            error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
            padding = [(None, None)] * (len(texts) - 1)
            sender.send(error_msg + padding)


class _Sender:
    """Util for sending data to multiprocessing workers in Language.pipe"""

    def __init__(
        self, data: Iterable[Any], queues: List[mp.Queue], chunk_size: int
    ) -> None:
        self.data = iter(data)
        self.queues = iter(cycle(queues))
        self.chunk_size = chunk_size
        self.count = 0

    def send(self) -> None:
        """Send chunk_size items from self.data to channels."""
        for item, q in itertools.islice(
            zip(self.data, cycle(self.queues)), self.chunk_size
        ):
            # cycle channels so that distribute the texts evenly
            q.put(item)

    def step(self) -> None:
        """Tell sender that comsumed one item. Data is sent to the workers after
        every chunk_size calls.
        """
        self.count += 1
        if self.count >= self.chunk_size:
            self.count = 0
            self.send()
-												Add the right return type for Language.pipe and an overload for the as_tuples case (#8441)

* Add the right return type for Language.pipe and an overload for the as_tuples version

* Reformat, tidy up

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
											
										
										
											2021-07-06 15:18:40 +03:00
+								from typing import Iterator, Optional, Any, Dict, Callable, Iterable, TypeVar
 								from typing import Union, List, Pattern, overload
-												Tidy up and auto-format

											
										
										
											2020-10-10 20:14:48 +03:00
+								from typing import Tuple
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								from dataclasses import dataclass
-												Rearrange multi-task learning

											
										
										
											2017-05-25 04:10:54 +03:00
+								import random
-												Update text classification model

											
										
										
											2017-07-25 19:57:59 +03:00
+								import itertools
-												Allow reasonably efficient pickling of Language class, using to_bytes() and from_bytes().

											
										
										
											2017-10-17 19:18:10 +03:00
+								import functools
-												Tidy up rest

											
										
										
											2017-10-27 22:07:59 +03:00
+								from contextlib import contextmanager
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								from copy import deepcopy
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 20:42:27 +03:00
+								from pathlib import Path
-												Simplify warnings

											
										
										
											2020-02-28 14:20:23 +03:00
+								import warnings
-												Add option to replace listeners for sourced components

											
										
										
											2021-01-29 07:57:04 +03:00
+								from thinc.api import get_current_ops, Config, Optimizer
-												💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintainence harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning

											
										
										
											2018-12-03 03:28:22 +03:00
+								import srsly
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								import multiprocessing as mp
 								from itertools import chain, cycle
-												Move timing into Language.evaluate (#5836)

Move timing into `Language.evaluate` so that only the processing is
timing, not processing + scoring. `Language.evaluate` returns
`scores["speed"]` as words per second, which should be identical to how
the speed was added to the scores previously. Also add the speed to the
evaluate CLI output.
											
										
										
											2020-07-29 12:02:31 +03:00
+								from timeit import default_timer as timer
-												Handle errors while multiprocessing (#8004)

* Handle errors while multiprocessing

Handle errors while multiprocessing without hanging.

* Return the traceback for errors raised while processing a batch, which
  can be handled by the top-level error handler
* Allow for shortened batches due to custom error handlers that ignore
  errors and skip documents

* Define custom components at a higher level

* Also move up custom error handler

* Use simpler component for test

* Switch error type

* Adjust test

* Only call top-level error handler for exceptions

* Register custom test components within tests

Use global functions (so they can be pickled) but register the
components only within the individual tests.
											
										
										
											2021-05-17 14:28:39 +03:00
+								import traceback
-												Fix GPU usage in Language

											
										
										
											2017-05-18 12:25:19 +03:00
-												load Underscore state when multiprocessing

											
										
										
											2020-02-12 13:50:42 +03:00
+								from .tokens.underscore import Underscore
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								from .vocab import Vocab, create_vocab
-												Simplify pipe analysis

- remove unused code
- don't print by default
- integrate attrs info into analysis output

											
										
										
											2020-08-01 14:40:06 +03:00
+								from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-												Renaming gold & annotation_setter (#6042)

* version bump to 3.0.0a16

* rename "gold" folder to "training"

* rename 'annotation_setter' to 'set_extra_annotations'

* formatting
											
										
										
											2020-09-09 11:31:03 +03:00
+								from .training import Example, validate_examples
-												Simplify config use in Language.initialize

											
										
										
											2020-09-29 17:05:48 +03:00
+								from .training.initialize import init_vocab, init_tok2vec
-												Fix formatting, tidy up and remove unused imports

											
										
										
											2017-10-07 01:26:05 +03:00
+								from .scorer import Scorer
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								from .util import registry, SimpleFrozenList, _pipe, raise_error
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
-												Add warning about GPU selection in Jupyter notebooks (#7075)

* Initial warning

* Update check

* Redo edit

* Move jupyter warning to helper method

* Add link with details to warnings
											
										
										
											2021-03-09 17:35:21 +03:00
+								from .util import warn_if_jupyter_cupy
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
-												Tidy up language, lemmatizer and scorer

											
										
										
											2017-10-27 15:40:14 +03:00
+								from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 								from .lang.punctuation import TOKENIZER_INFIXES
-												Tidy up and move noun_chunks, token_match, url_match

											
										
										
											2020-07-22 23:18:46 +03:00
+								from .tokens import Doc
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								from .tokenizer import Tokenizer
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								from .errors import Errors, Warnings
-												Simplify config use in Language.initialize

											
										
										
											2020-09-29 17:05:48 +03:00
+								from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
 								from .schemas import ConfigSchemaPretrain, validate_init_settings
-												Include git commit in package and model meta (#5694)

* Include git commit in package and model meta

* Rewrite to read file in setup

* Fix file handle
											
										
										
											2020-07-02 18:10:27 +03:00
+								from .git_info import GIT_VERSION
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
+								from . import util
-												Fix formatting, tidy up and remove unused imports

											
										
										
											2017-10-07 01:26:05 +03:00
+								from . import about
-												Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created.

The option moves from `nlp.load_vocab_data` to `training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided.

The loading from `spacy-lookups-data` is now strict, so configs for each
language should specify the exact tables required. This also makes it
easier to control whether the larger clusters and probs tables are
included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```

											
										
										
											2020-09-18 16:45:55 +03:00
+								from .lookups import load_lookups
-												Refactor training, with new spacy.train module. Defaults still a little awkward.

											
										
										
											2016-10-09 13:24:24 +03:00
-												* Tagger training now working. Still need to test load/save of model. Morphology still broken.

											
										
										
											2015-08-27 10:16:11 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								# This is the base config will all settings (training etc.)
 								DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
-												Fix handling of optional [pretraining] block (#5954)

* Fix handling of optional [pretraining] block

* Remote pretraining from default config

* Fix test

* Add schema option for empty pretrain block
											
										
										
											2020-08-24 16:56:03 +03:00
+								# This is the base config for the [pretraining] block and currently not included
 								# in the main config and only added via the 'init fill-config' command
 								DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
-												Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to go for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors

											
										
										
											2019-10-27 15:35:49 +03:00
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class BaseDefaults:
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								    """Language data defaults, available via Language.Defaults. Can be
 								    overwritten by language subclasses by defining their own subclasses of
 								    Language.Defaults.
 								    """
-												Update docs and consistency

											
										
										
											2020-07-29 16:14:07 +03:00
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								    config: Config = Config(section_order=CONFIG_SECTION_ORDER)
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								    tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
 								    prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
 								    suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
 								    infixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_INFIXES
 								    token_match: Optional[Pattern] = None
 								    url_match: Optional[Pattern] = URL_MATCH
 								    syntax_iterators: Dict[str, Callable] = {}
 								    lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
 								    stop_words = set()
 								    writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
 								@registry.tokenizers("spacy.Tokenizer.v1")
 								def create_tokenizer() -> Callable[["Language"], Tokenizer]:
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								    """Registered function to create a tokenizer. Returns a factory that takes
 								    the nlp object and returns a Tokenizer instance using the language detaults.
 								    """
-												Update docs and consistency

											
										
										
											2020-07-29 16:14:07 +03:00
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								    def tokenizer_factory(nlp: "Language") -> Tokenizer:
 								        prefixes = nlp.Defaults.prefixes
 								        suffixes = nlp.Defaults.suffixes
 								        infixes = nlp.Defaults.infixes
 								        prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
 								        suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
 								        infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
 								        return Tokenizer(
 								            nlp.vocab,
 								            rules=nlp.Defaults.tokenizer_exceptions,
 								            prefix_search=prefix_search,
 								            suffix_search=suffix_search,
 								            infix_finditer=infix_finditer,
 								            token_match=nlp.Defaults.token_match,
 								            url_match=nlp.Defaults.url_match,
 								        )
 								    return tokenizer_factory
-												Minor renaming / refactoring

* Rename loader to `spacy.LookupsDataLoader.v1`, add debugging message
* Make `Vocab.lookups` a property

											
										
										
											2020-09-18 20:43:19 +03:00
+								@registry.misc("spacy.LookupsDataLoader.v1")
-												Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created.

The option moves from `nlp.load_vocab_data` to `training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided.

The loading from `spacy-lookups-data` is now strict, so configs for each
language should specify the exact tables required. This also makes it
easier to control whether the larger clusters and probs tables are
included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```

											
										
										
											2020-09-18 16:45:55 +03:00
+								def load_lookups_data(lang, tables):
-												Minor renaming / refactoring

* Rename loader to `spacy.LookupsDataLoader.v1`, add debugging message
* Make `Vocab.lookups` a property

											
										
										
											2020-09-18 20:43:19 +03:00
+								    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
-												Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created.

The option moves from `nlp.load_vocab_data` to `training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided.

The loading from `spacy-lookups-data` is now strict, so configs for each
language should specify the exact tables required. This also makes it
easier to control whether the larger clusters and probs tables are
included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```

											
										
										
											2020-09-18 16:45:55 +03:00
+								    lookups = load_lookups(lang=lang, tables=tables)
 								    return lookups
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class Language:
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								    """A text-processing pipeline. Usually you'll load this once per process,
 								    and pass the instance around your application.
-												Update docstrings and API docs for Language

											
										
										
											2017-05-19 19:47:24 +03:00
 								    Defaults (class): Settings, data and factory methods for creating the `nlp`
 								        object and processing pipeline.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    lang (str): Two-letter language ID, i.e. ISO code.
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								    DOCS: https://spacy.io/api/language
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    """
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
+								    Defaults = BaseDefaults
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    lang: str = None
 								    default_config = DEFAULT_CONFIG
-												* Add language base class

											
										
										
											2015-08-25 16:37:17 +03:00
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								    factories = SimpleFrozenDict(error=Errors.E957)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    _factory_meta: Dict[str, "FactoryMeta"] = {}  # meta by factory
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    def __init__(
-												Tidy up and auto-format

											
										
										
											2020-02-28 13:57:41 +03:00
+								        self,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        vocab: Union[Vocab, bool] = True,
-												Make more args keyword-only

											
										
										
											2020-07-27 01:27:53 +03:00
+								        *,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        max_length: int = 10 ** 6,
 								        meta: Dict[str, Any] = {},
 								        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        batch_size: int = 1000,
-												Tidy up and auto-format

											
										
										
											2020-02-28 13:57:41 +03:00
+								        **kwargs,
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								    ) -> None:
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        """Initialise a Language object.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        meta (dict): Custom meta data for the Language class. Is written to by
 								            models to add model meta data.
-												Re-add setting for vocab data and tidy up

											
										
										
											2020-07-25 13:14:28 +03:00
+								        max_length (int): Maximum number of characters in a single text. The
 								            current models may run out memory on extremely long texts, due to
 								            large internal allocations. You should segment these texts into
 								            meaningful units, e.g. paragraphs, subsections etc, before passing
 								            them to spaCy. Default maximum length is 1,000,000 charas (1mb). As
 								            a rule of thumb, if all pipeline components are enabled, spaCy's
 								            default models currently requires roughly 1GB of temporary memory per
-												Add input length error, to address #1826

											
										
										
											2018-03-29 22:45:26 +03:00
+,000 characters in one text.
-												Re-add setting for vocab data and tidy up

											
										
										
											2020-07-25 13:14:28 +03:00
+								        create_tokenizer (Callable): Function that takes the nlp object and
 								            returns a tokenizer.
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        batch_size (int): Default batch size for pipe and evaluate.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#init
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        """
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        # We're only calling this to import all factories provided via entry
 								        # points. The factory decorator applied to these functions takes care
 								        # of the rest.
 								        util.registry._entry_point_factories.get_all()
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        self._config = DEFAULT_CONFIG.merge(self.default_config)
-												Compute Language.meta with a property

											
										
										
											2017-07-23 01:50:18 +03:00
+								        self._meta = dict(meta)
-												Add Language.path

											
										
										
											2017-10-25 12:57:43 +03:00
+								        self._path = None
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self._optimizer = None
 								        # Component meta and configs are only needed on the instance
 								        self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
 								        self._pipe_configs: Dict[str, Config] = {}  # config by component
-												Raise for bad Vocab values

											
										
										
											2020-09-15 14:25:34 +03:00
+								        if not isinstance(vocab, Vocab) and vocab is not True:
 								            raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
+								        if vocab is True:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            vectors_name = meta.get("vectors", {}).get("name")
-												Tidy up and auto-format

											
										
										
											2020-09-21 11:59:07 +03:00
+								            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
-												ensure the lang of vocab and nlp stay consistent (#4057)

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

											
										
										
											2019-08-01 18:13:01 +03:00
+								        else:
 								            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
 								                raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								        self.vocab: Vocab = vocab
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        if self.lang is None:
 								            self.lang = self.vocab.lang
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self._components = []
-												Make user-facing Language.disabled return list

More consistent with all the other properties

											
										
										
											2020-08-29 13:08:33 +03:00
+								        self._disabled = set()
-												Add input length error, to address #1826

											
										
										
											2018-03-29 22:45:26 +03:00
+								        self.max_length = max_length
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        # Create the default tokenizer from the default config
 								        if not create_tokenizer:
 								            tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								            create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self.tokenizer = create_tokenizer(self)
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        self.batch_size = batch_size
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								        self.default_error_handler = raise_error
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
 								    def __init_subclass__(cls, **kwargs):
 								        super().__init_subclass__(**kwargs)
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								        cls.default_config["nlp"]["lang"] = cls.lang
-												* Very scrappy, likely buggy first-cut pickle implementation, to work on Issue #125: allow pickle for Apache Spark. The current implementation sends stuff to temp files, and does almost nothing to ensure all modifiable state is actually preserved. The Language() instance is a deep tree of extension objects, and if pickling during training, some of the C-data state is hard to preserve.

											
										
										
											2015-10-12 11:33:11 +03:00
-												Add Language.path

											
										
										
											2017-10-25 12:57:43 +03:00
+								    @property
 								    def path(self):
 								        return self._path
-												Compute Language.meta with a property

											
										
										
											2017-07-23 01:50:18 +03:00
+								    @property
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def meta(self) -> Dict[str, Any]:
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        """Custom meta data of the language class. If a model is loaded, this
 								        includes details from the model's meta.json.
 								        RETURNS (Dict[str, Any]): The meta.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#meta
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        """
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
+								        spacy_version = util.get_minor_version_range(about.__version__)
-												ensure the lang of vocab and nlp stay consistent (#4057)

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

											
										
										
											2019-08-01 18:13:01 +03:00
+								        if self.vocab.lang:
 								            self._meta.setdefault("lang", self.vocab.lang)
 								        else:
 								            self._meta.setdefault("lang", self.lang)
-												"model" terminology consistency in docs

											
										
										
											2020-09-03 14:13:03 +03:00
+								        self._meta.setdefault("name", "pipeline")
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        self._meta.setdefault("version", "0.0.0")
-												Use more sophisticated version parsing logic

											
										
										
											2020-05-30 16:01:58 +03:00
+								        self._meta.setdefault("spacy_version", spacy_version)
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        self._meta.setdefault("description", "")
 								        self._meta.setdefault("author", "")
 								        self._meta.setdefault("email", "")
 								        self._meta.setdefault("url", "")
 								        self._meta.setdefault("license", "")
-												Include git commit in package and model meta (#5694)

* Include git commit in package and model meta

* Rewrite to read file in setup

* Fix file handle
											
										
										
											2020-07-02 18:10:27 +03:00
+								        self._meta.setdefault("spacy_git_version", GIT_VERSION)
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        self._meta["vectors"] = {
 								            "width": self.vocab.vectors_length,
 								            "vectors": len(self.vocab.vectors),
 								            "keys": self.vocab.vectors.n_keys,
 								            "name": self.vocab.vectors.name,
 								        }
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self._meta["labels"] = dict(self.pipe_labels)
-												Re-add meta["pipeline"] for now

											
										
										
											2020-07-28 17:14:23 +03:00
+								        # TODO: Adding this back to prevent breaking people's code etc., but
 								        # we should consider removing it
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self._meta["pipeline"] = list(self.pipe_names)
-												Fix components in meta.json and website [ci skip]

											
										
										
											2020-09-04 15:42:12 +03:00
+								        self._meta["components"] = list(self.component_names)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self._meta["disabled"] = list(self.disabled)
-												Compute Language.meta with a property

											
										
										
											2017-07-23 01:50:18 +03:00
+								        return self._meta
 								    @meta.setter
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def meta(self, value: Dict[str, Any]) -> None:
-												Compute Language.meta with a property

											
										
										
											2017-07-23 01:50:18 +03:00
+								        self._meta = value
-												Add lookup properties for components in Language

											
										
										
											2017-06-04 23:52:09 +03:00
+								    @property
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def config(self) -> Config:
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        """Trainable config for the current language instance. Includes the
 								        current pipeline components, as well as default training config.
 								        RETURNS (thinc.api.Config): The config.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#config
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        """
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self._config.setdefault("nlp", {})
-												Allow pipeline components to set default scores and weights

											
										
										
											2020-07-26 14:18:43 +03:00
+								        self._config.setdefault("training", {})
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self._config["nlp"]["lang"] = self.lang
 								        # We're storing the filled config for each pipeline component and so
 								        # we can populate the config again later
 								        pipeline = {}
-												Allow pipeline components to set default scores and weights

											
										
										
											2020-07-26 14:18:43 +03:00
+								        score_weights = []
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        for pipe_name in self.component_names:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            pipe_meta = self.get_pipe_meta(pipe_name)
 								            pipe_config = self.get_pipe_config(pipe_name)
-												@factories -> factory (#5801)


											
										
										
											2020-07-22 18:29:31 +03:00
+								            pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
-												Add and update score methods and score weights

Add and update `score` methods, provided `scores`, and default weights
`default_score_weights` for pipeline components.

* `scores` provides all top-level keys returned by `score` (merely informative, similar to `assigns`).
* `default_score_weights` provides the default weights for a default config.
* The keys from `default_score_weights` determine which values will be
shown in the `spacy train` output, so keys with weight `0.0` will be
displayed but not counted toward the overall score.

											
										
										
											2020-07-27 13:27:40 +03:00
+								            if pipe_meta.default_score_weights:
 								                score_weights.append(pipe_meta.default_score_weights)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self._config["nlp"]["pipeline"] = list(self.component_names)
 								        self._config["nlp"]["disabled"] = list(self.disabled)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self._config["components"] = pipeline
-												Fix handling of score_weights

											
										
										
											2020-09-24 11:27:33 +03:00
+								        # We're merging the existing score weights back into the combined
 								        # weights to make sure we're preserving custom settings in the config
 								        # but also reflect updates (e.g. new components added)
-												Fix combined scores and update test

											
										
										
											2020-09-24 11:42:47 +03:00
+								        prev_weights = self._config["training"].get("score_weights", {})
 								        combined_score_weights = combine_score_weights(score_weights, prev_weights)
-												Fix handling of score_weights

											
										
										
											2020-09-24 11:27:33 +03:00
+								        self._config["training"]["score_weights"] = combined_score_weights
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        if not srsly.is_json_serializable(self._config):
 								            raise ValueError(Errors.E961.format(config=self._config))
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 20:42:27 +03:00
+								        return self._config
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    @config.setter
 								    def config(self, value: Config) -> None:
 								        self._config = value
-												Make user-facing Language.disabled return list

More consistent with all the other properties

											
										
										
											2020-08-29 13:08:33 +03:00
+								    @property
 								    def disabled(self) -> List[str]:
 								        """Get the names of all disabled components.
 								        RETURNS (List[str]): The disabled components.
 								        """
-												Work around set order in Language.disabled

											
										
										
											2020-08-29 13:58:22 +03:00
+								        # Make sure the disabled components are returned in the order they
 								        # appear in the pipeline (which isn't guaranteed by the set)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        names = [name for name, _ in self._components if name in self._disabled]
 								        return SimpleFrozenList(names, error=Errors.E926.format(attr="disabled"))
-												Make user-facing Language.disabled return list

More consistent with all the other properties

											
										
										
											2020-08-29 13:08:33 +03:00
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								    @property
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def factory_names(self) -> List[str]:
 								        """Get names of all available factories.
 								        RETURNS (List[str]): The factory names.
 								        """
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        names = list(self.factories.keys())
 								        return SimpleFrozenList(names)
 								    @property
 								    def components(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
 								        """Get all (name, component) tuples in the pipeline, including the
 								        currently disabled components.
 								        """
 								        return SimpleFrozenList(
 								            self._components, error=Errors.E926.format(attr="components")
 								        )
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    @property
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								    def component_names(self) -> List[str]:
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        """Get the names of the available pipeline components. Includes all
 								        active and inactive pipeline components.
 								        RETURNS (List[str]): List of component name strings, in order.
 								        """
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        names = [pipe_name for pipe_name, _ in self._components]
 								        return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
 								    @property
 								    def pipeline(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
 								        """The processing pipeline consisting of (name, component) tuples. The
 								        components are called on the Doc in order as it passes through the
 								        pipeline.
 								        RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
 								        """
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        pipes = [(n, p) for n, p in self._components if n not in self._disabled]
 								        return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
 								    @property
 								    def pipe_names(self) -> List[str]:
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        """Get names of available active pipeline components.
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        RETURNS (List[str]): List of component name strings, in order.
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        names = [pipe_name for pipe_name, _ in self.pipeline]
 								        return SimpleFrozenList(names, error=Errors.E926.format(attr="pipe_names"))
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to go for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors

											
										
										
											2019-10-27 15:35:49 +03:00
+								    @property
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def pipe_factories(self) -> Dict[str, str]:
-												Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to go for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors

											
										
										
											2019-10-27 15:35:49 +03:00
+								        """Get the component factories for the available pipeline components.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        RETURNS (Dict[str, str]): Factory names, keyed by component names.
-												Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to go for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors

											
										
										
											2019-10-27 15:35:49 +03:00
+								        """
 								        factories = {}
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        for pipe_name, pipe in self._components:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            factories[pipe_name] = self.get_pipe_meta(pipe_name).factory
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        return SimpleFrozenDict(factories)
-												Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to go for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors

											
										
										
											2019-10-27 15:35:49 +03:00
-												💫 Add Language.pipe_labels (#4276)

* Add Language.pipe_labels

* Update spacy/language.py

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

											
										
										
											2019-09-12 11:56:28 +03:00
+								    @property
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def pipe_labels(self) -> Dict[str, List[str]]:
-												Update Language docs [ci skip]

											
										
										
											2019-09-12 14:03:38 +03:00
+								        """Get the labels set by the pipeline components, if available (if
 								        the component exposes a labels property).
-												💫 Add Language.pipe_labels (#4276)

* Add Language.pipe_labels

* Update spacy/language.py

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

											
										
										
											2019-09-12 11:56:28 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        RETURNS (Dict[str, List[str]]): Labels keyed by component name.
-												💫 Add Language.pipe_labels (#4276)

* Add Language.pipe_labels

* Update spacy/language.py

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

											
										
										
											2019-09-12 11:56:28 +03:00
+								        """
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								        labels = {}
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        for name, pipe in self._components:
-												💫 Add Language.pipe_labels (#4276)

* Add Language.pipe_labels

* Update spacy/language.py

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

											
										
										
											2019-09-12 11:56:28 +03:00
+								            if hasattr(pipe, "labels"):
 								                labels[name] = list(pipe.labels)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        return SimpleFrozenDict(labels)
-												💫 Add Language.pipe_labels (#4276)

* Add Language.pipe_labels

* Update spacy/language.py

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

											
										
										
											2019-09-12 11:56:28 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    @classmethod
 								    def has_factory(cls, name: str) -> bool:
 								        """RETURNS (bool): Whether a factory of that name is registered."""
 								        internal_name = cls.get_factory_name(name)
 								        return name in registry.factories or internal_name in registry.factories
 								    @classmethod
 								    def get_factory_name(cls, name: str) -> str:
 								        """Get the internal factory name based on the language subclass.
 								        name (str): The factory name.
 								        RETURNS (str): The internal factory name.
 								        """
 								        if cls.lang is None:
 								            return name
 								        return f"{cls.lang}.{name}"
 								    @classmethod
 								    def get_factory_meta(cls, name: str) -> "FactoryMeta":
 								        """Get the meta information for a given factory name.
 								        name (str): The component factory name.
 								        RETURNS (FactoryMeta): The meta for the given factory name.
 								        """
 								        internal_name = cls.get_factory_name(name)
 								        if internal_name in cls._factory_meta:
 								            return cls._factory_meta[internal_name]
 								        if name in cls._factory_meta:
 								            return cls._factory_meta[name]
 								        raise ValueError(Errors.E967.format(meta="factory", name=name))
 								    @classmethod
 								    def set_factory_meta(cls, name: str, value: "FactoryMeta") -> None:
 								        """Set the meta information for a given factory name.
 								        name (str): The component factory name.
 								        value (FactoryMeta): The meta to set.
 								        """
 								        cls._factory_meta[cls.get_factory_name(name)] = value
 								    def get_pipe_meta(self, name: str) -> "FactoryMeta":
 								        """Get the meta information for a given component name.
 								        name (str): The component name.
 								        RETURNS (FactoryMeta): The meta for the given component name.
 								        """
 								        if name not in self._pipe_meta:
 								            raise ValueError(Errors.E967.format(meta="component", name=name))
 								        return self._pipe_meta[name]
 								    def get_pipe_config(self, name: str) -> Config:
 								        """Get the config used to create a pipeline component.
 								        name (str): The component name.
 								        RETURNS (Config): The config used to create the pipeline component.
 								        """
 								        if name not in self._pipe_configs:
 								            raise ValueError(Errors.E960.format(name=name))
 								        pipe_config = self._pipe_configs[name]
 								        return pipe_config
 								    @classmethod
 								    def factory(
 								        cls,
 								        name: str,
 								        *,
 								        default_config: Dict[str, Any] = SimpleFrozenDict(),
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        assigns: Iterable[str] = SimpleFrozenList(),
 								        requires: Iterable[str] = SimpleFrozenList(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        retokenizes: bool = False,
-												Add and update score methods and score weights

Add and update `score` methods, provided `scores`, and default weights
`default_score_weights` for pipeline components.

* `scores` provides all top-level keys returned by `score` (merely informative, similar to `assigns`).
* `default_score_weights` provides the default weights for a default config.
* The keys from `default_score_weights` determine which values will be
shown in the `spacy train` output, so keys with weight `0.0` will be
displayed but not counted toward the overall score.

											
										
										
											2020-07-27 13:27:40 +03:00
+								        default_score_weights: Dict[str, float] = SimpleFrozenDict(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        func: Optional[Callable] = None,
 								    ) -> Callable:
 								        """Register a new pipeline component factory. Can be used as a decorator
 								        on a function or classmethod, or called as a function with the factory
 								        provided as the func keyword argument. To create a component and add
 								        it to the pipeline, you can use nlp.add_pipe(name).
 								        name (str): The name of the component factory.
 								        default_config (Dict[str, Any]): Default configuration, describing the
 								            default values of the factory arguments.
 								        assigns (Iterable[str]): Doc/Token attributes assigned by this component,
-												Fix typo in Language docstrings (#7958)


											
										
										
											2021-05-03 15:44:09 +03:00
+								            e.g. "token.ent_id". Used for pipeline analysis.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        requires (Iterable[str]): Doc/Token attributes required by this component,
-												Fix typo in Language docstrings (#7958)


											
										
										
											2021-05-03 15:44:09 +03:00
+								            e.g. "token.ent_id". Used for pipeline analysis.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        retokenizes (bool): Whether the component changes the tokenization.
 								            Used for pipeline analysis.
-												Remove scores list from config and document

											
										
										
											2020-07-28 12:22:24 +03:00
+								        default_score_weights (Dict[str, float]): The scores to report during
 								            training, and their default weight towards the final score used to
 								            select the best model. Weights should sum to 1.0 per component and
-												Fix handling of score_weights

											
										
										
											2020-09-24 11:27:33 +03:00
+								            will be combined and normalized for the whole pipeline. If None,
 								            the score won't be shown in the logs or be weighted.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        func (Optional[Callable]): Factory function if not used as a decorator.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#factory
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        """
 								        if not isinstance(name, str):
 								            raise ValueError(Errors.E963.format(decorator="factory"))
 								        if not isinstance(default_config, dict):
 								            err = Errors.E962.format(
 								                style="default config", name=name, cfg_type=type(default_config)
 								            )
 								            raise ValueError(err)
 								        def add_factory(factory_func: Callable) -> Callable:
-												Allow component decorators to re-run with same function

											
										
										
											2020-08-28 17:27:22 +03:00
+								            internal_name = cls.get_factory_name(name)
 								            if internal_name in registry.factories:
 								                # We only check for the internal name here – it's okay if it's a
 								                # subclass and the base class has a factory of the same name. We
 								                # also only raise if the function is different to prevent raising
 								                # if module is reloaded.
 								                existing_func = registry.factories.get(internal_name)
 								                if not util.is_same_func(factory_func, existing_func):
 								                    err = Errors.E004.format(
 								                        name=name, func=existing_func, new_func=factory_func
 								                    )
 								                    raise ValueError(err)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            arg_names = util.get_arg_names(factory_func)
 								            if "nlp" not in arg_names or "name" not in arg_names:
 								                raise ValueError(Errors.E964.format(name=name))
 								            # Officially register the factory so we can later call
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								            # registry.resolve and refer to it in the config as
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            # @factories = "spacy.Language.xyz". We use the class name here so
 								            # different classes can have different factories.
 								            registry.factories.register(internal_name, func=factory_func)
 								            factory_meta = FactoryMeta(
 								                factory=name,
 								                default_config=default_config,
 								                assigns=validate_attrs(assigns),
 								                requires=validate_attrs(requires),
-												Fix handling of score_weights

											
										
										
											2020-09-24 11:27:33 +03:00
+								                scores=list(default_score_weights.keys()),
-												Add and update score methods and score weights

Add and update `score` methods, provided `scores`, and default weights
`default_score_weights` for pipeline components.

* `scores` provides all top-level keys returned by `score` (merely informative, similar to `assigns`).
* `default_score_weights` provides the default weights for a default config.
* The keys from `default_score_weights` determine which values will be
shown in the `spacy train` output, so keys with weight `0.0` will be
displayed but not counted toward the overall score.

											
										
										
											2020-07-27 13:27:40 +03:00
+								                default_score_weights=default_score_weights,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								                retokenizes=retokenizes,
 								            )
 								            cls.set_factory_meta(name, factory_meta)
 								            # We're overwriting the class attr with a frozen dict to handle
 								            # backwards-compat (writing to Language.factories directly). This
 								            # wouldn't work with an instance property and just produce a
 								            # confusing error – here we can show a custom error
 								            cls.factories = SimpleFrozenDict(
 								                registry.factories.get_all(), error=Errors.E957
 								            )
 								            return factory_func
 								        if func is not None:  # Support non-decorator use cases
 								            return add_factory(func)
 								        return add_factory
 								    @classmethod
 								    def component(
 								        cls,
 								        name: Optional[str] = None,
 								        *,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        assigns: Iterable[str] = SimpleFrozenList(),
 								        requires: Iterable[str] = SimpleFrozenList(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        retokenizes: bool = False,
 								        func: Optional[Callable[[Doc], Doc]] = None,
 								    ) -> Callable:
 								        """Register a new pipeline component. Can be used for stateless function
 								        components that don't require a separate factory. Can be used as a
 								        decorator on a function or classmethod, or called as a function with the
 								        factory provided as the func keyword argument. To create a component and
 								        add it to the pipeline, you can use nlp.add_pipe(name).
 								        name (str): The name of the component factory.
 								        assigns (Iterable[str]): Doc/Token attributes assigned by this component,
-												Fix typo in Language docstrings (#7958)


											
										
										
											2021-05-03 15:44:09 +03:00
+								            e.g. "token.ent_id". Used for pipeline analysis.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        requires (Iterable[str]): Doc/Token attributes required by this component,
-												Fix typo in Language docstrings (#7958)


											
										
										
											2021-05-03 15:44:09 +03:00
+								            e.g. "token.ent_id". Used for pipeline analysis.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        retokenizes (bool): Whether the component changes the tokenization.
 								            Used for pipeline analysis.
 								        func (Optional[Callable]): Factory function if not used as a decorator.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#component
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        """
 								        if name is not None and not isinstance(name, str):
 								            raise ValueError(Errors.E963.format(decorator="component"))
 								        component_name = name if name is not None else util.get_object_name(func)
 								        def add_component(component_func: Callable[[Doc], Doc]) -> Callable:
 								            if isinstance(func, type):  # function is a class
 								                raise ValueError(Errors.E965.format(name=component_name))
 								            def factory_func(nlp: cls, name: str) -> Callable[[Doc], Doc]:
 								                return component_func
-												Allow component decorators to re-run with same function

											
										
										
											2020-08-28 17:27:22 +03:00
+								            internal_name = cls.get_factory_name(name)
 								            if internal_name in registry.factories:
 								                # We only check for the internal name here – it's okay if it's a
 								                # subclass and the base class has a factory of the same name. We
 								                # also only raise if the function is different to prevent raising
 								                # if module is reloaded. It's hacky, but we need to check the
 								                # existing functure for a closure and whether that's identical
 								                # to the component function (because factory_func created above
 								                # will always be different, even for the same function)
 								                existing_func = registry.factories.get(internal_name)
 								                closure = existing_func.__closure__
 								                wrapped = [c.cell_contents for c in closure][0] if closure else None
 								                if util.is_same_func(wrapped, component_func):
 								                    factory_func = existing_func  # noqa: F811
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            cls.factory(
 								                component_name,
 								                assigns=assigns,
 								                requires=requires,
 								                retokenizes=retokenizes,
 								                func=factory_func,
 								            )
 								            return component_func
 								        if func is not None:  # Support non-decorator use cases
 								            return add_component(func)
 								        return add_component
-												Integrate and simplify pipe analysis

											
										
										
											2020-07-31 19:34:35 +03:00
+								    def analyze_pipes(
 								        self,
 								        *,
 								        keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
-												Simplify pipe analysis

- remove unused code
- don't print by default
- integrate attrs info into analysis output

											
										
										
											2020-08-01 14:40:06 +03:00
+								        pretty: bool = False,
-												Integrate and simplify pipe analysis

											
										
										
											2020-07-31 19:34:35 +03:00
+								    ) -> Optional[Dict[str, Any]]:
 								        """Analyze the current pipeline components, print a summary of what
 								        they assign or require and check that all requirements are met.
 								        keys (List[str]): The meta values to display in the table. Corresponds
 								            to values in FactoryMeta, defined by @Language.factory decorator.
-												Simplify pipe analysis

- remove unused code
- don't print by default
- integrate attrs info into analysis output

											
										
										
											2020-08-01 14:40:06 +03:00
+								        pretty (bool): Pretty-print the results.
 								        RETURNS (dict): The data.
-												Integrate and simplify pipe analysis

											
										
										
											2020-07-31 19:34:35 +03:00
+								        """
-												Simplify pipe analysis

- remove unused code
- don't print by default
- integrate attrs info into analysis output

											
										
										
											2020-08-01 14:40:06 +03:00
+								        analysis = analyze_pipes(self, keys=keys)
 								        if pretty:
 								            print_pipe_analysis(analysis, keys=keys)
 								        return analysis
-												Integrate and simplify pipe analysis

											
										
										
											2020-07-31 19:34:35 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """Get a pipeline component for a given component name.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of pipeline component to get.
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        RETURNS (callable): The pipeline component.
-												Tidy up docstrings

											
										
										
											2019-03-15 18:23:17 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#get_pipe
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        for pipe_name, component in self._components:
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								            if pipe_name == name:
 								                return component
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        raise KeyError(Errors.E001.format(name=name, opts=self.component_names))
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def create_pipe(
 								        self,
 								        factory_name: str,
 								        name: Optional[str] = None,
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        *,
-												config is not Optional (#9024)


											
										
										
											2021-08-27 12:44:31 +03:00
+								        config: Dict[str, Any] = SimpleFrozenDict(),
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        raw_config: Optional[Config] = None,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        validate: bool = True,
 								    ) -> Callable[[Doc], Doc]:
 								        """Create a pipeline component. Mostly used internally. To create and
 								        add a component to the pipeline, you can use nlp.add_pipe.
 								        factory_name (str): Name of component factory.
 								        name (Optional[str]): Optional name to assign to component instance.
 								            Defaults to factory name if not set.
-												config is not Optional (#9024)


											
										
										
											2021-08-27 12:44:31 +03:00
+								        config (Dict[str, Any]): Config parameters to use for this component.
 								            Will be merged with default config, if available.
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        raw_config (Optional[Config]): Internals: the non-interpolated config.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        validate (bool): Whether to validate the component config against the
 								            arguments and types expected by the factory.
 								        RETURNS (Callable[[Doc], Doc]): The pipeline component.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#create_pipe
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        name = name if name is not None else factory_name
 								        if not isinstance(config, dict):
 								            err = Errors.E962.format(style="config", name=name, cfg_type=type(config))
 								            raise ValueError(err)
 								        if not srsly.is_json_serializable(config):
 								            raise ValueError(Errors.E961.format(config=config))
 								        if not self.has_factory(factory_name):
 								            err = Errors.E002.format(
 								                name=factory_name,
 								                opts=", ".join(self.factory_names),
 								                method="create_pipe",
 								                lang=util.get_object_name(self),
 								                lang_code=self.lang,
-												Merge branch 'develop' into master-tmp

											
										
										
											2020-05-21 19:39:06 +03:00
+								            )
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            raise ValueError(err)
 								        pipe_meta = self.get_factory_meta(factory_name)
 								        # This is unideal, but the alternative would mean you always need to
 								        # specify the full config settings, which is not really viable.
 								        if pipe_meta.default_config:
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								            config = Config(pipe_meta.default_config).merge(config)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        internal_name = self.get_factory_name(factory_name)
 								        # If the language-specific factory doesn't exist, try again with the
 								        # not-specific name
 								        if internal_name not in registry.factories:
 								            internal_name = factory_name
-												Tidy up code comments [ci skip]

											
										
										
											2021-01-27 04:40:03 +03:00
+								        # The name allows components to know their pipe name and use it in the
 								        # losses etc. (even if multiple instances of the same factory are used)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        config = {"nlp": self, "name": name, **config, "@factories": internal_name}
-												Tidy up code comments [ci skip]

											
										
										
											2021-01-27 04:40:03 +03:00
+								        # We need to create a top-level key because Thinc doesn't allow resolving
 								        # top-level references to registered functions. Also gives nicer errors.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        cfg = {factory_name: config}
 								        # We're calling the internal _fill here to avoid constructing the
 								        # registered functions twice
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								        resolved = registry.resolve(cfg, validate=validate)
 								        filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
 								        filled = Config(filled)
-												@factories -> factory (#5801)


											
										
										
											2020-07-22 18:29:31 +03:00
+								        filled["factory"] = factory_name
-												Make sure @factories is removed from config

											
										
										
											2020-07-26 16:11:24 +03:00
+								        filled.pop("@factories", None)
-												Remove extra pipe config values before merging

											
										
										
											2020-09-15 15:24:17 +03:00
+								        # Remove the extra values we added because we don't want to keep passing
 								        # them around, copying them etc.
 								        filled.pop("nlp", None)
 								        filled.pop("name", None)
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        # Merge the final filled config with the raw config (including non-
 								        # interpolated variables)
 								        if raw_config:
 								            filled = filled.merge(raw_config)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self._pipe_configs[name] = filled
 								        return resolved[factory_name]
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    def create_pipe_from_source(
-												Pipe API (#6034)

* ensure Language passes on valid examples for initialization

* fix tagger model initialization

* check for valid get_examples across components

* assume labels were added before begin_training

* fix senter initialization

* fix morphologizer initialization

* use methods to check arguments

* test textcat init, requires thinc>=8.0.0a31

* fix tok2vec init

* fix entity linker init

* use islice

* fix simple NER

* cleanup debug model

* fix assert statements

* fix tests

* throw error when adding a label if the output layer can't be resized anymore

* fix test

* add failing test for simple_ner

* UX improvements

* morphologizer UX

* assume begin_training gets a representative set and processes the labels

* remove assumptions for output of untrained NER model

* restore test for original purpose
											
										
										
											2020-09-08 23:44:25 +03:00
+								        self, source_name: str, source: "Language", *, name: str
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    ) -> Tuple[Callable[[Doc], Doc], str]:
 								        """Create a pipeline component by copying it from an existing model.
 								        source_name (str): Name of the component in the source pipeline.
 								        source (Language): The source nlp object to copy from.
 								        name (str): Optional alternative name to use in current pipeline.
 								        RETURNS (Tuple[Callable, str]): The component and its factory name.
 								        """
-												Improve checks for sourced components (#7490)

* Improve checks for sourced components

* Remove language class checks

* Convert python warning to logger warning

* Remove unused warning

* Fix formatting
											
										
										
											2021-04-19 11:36:32 +03:00
+								        # Check source type
 								        if not isinstance(source, Language):
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								            raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
-												Improve checks for sourced components (#7490)

* Improve checks for sourced components

* Remove language class checks

* Convert python warning to logger warning

* Remove unused warning

* Fix formatting
											
										
										
											2021-04-19 11:36:32 +03:00
+								        # Check vectors, with faster checks first
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								        if (
 								            self.vocab.vectors.shape != source.vocab.vectors.shape
 								            or self.vocab.vectors.key2row != source.vocab.vectors.key2row
 								            or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
 								        ):
-												Use warnings.warn instead of logger.warning

											
										
										
											2021-06-04 18:44:04 +03:00
+								            warnings.warn(Warnings.W113.format(name=source_name))
-												Tidy up with flake8: imports, comparisons, etc.

											
										
										
											2021-06-28 13:03:29 +03:00
+								        if source_name not in source.component_names:
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								            raise KeyError(
 								                Errors.E944.format(
 								                    name=source_name,
 								                    model=f"{source.meta['lang']}_{source.meta['name']}",
-												Allow sourcing disabled components (#7215)

Check `component_names` instead of `pipe_names` to allow sourcing
disabled components.
											
										
										
											2021-02-26 15:50:56 +03:00
+								                    opts=", ".join(source.component_names),
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								                )
 								            )
 								        pipe = source.get_pipe(source_name)
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        # Make sure the source config is interpolated so we don't end up with
 								        # orphaned variables in our final config
 								        source_config = source.config.interpolate()
 								        pipe_config = util.copy_config(source_config["components"][source_name])
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        self._pipe_configs[name] = pipe_config
-												Sync vocab in vectors and components sourced in configs (#9335)

Since a component may reference anything in the vocab, share the full
vocab when loading source components and vectors (which will include
`strings` as of #8909).

When loading a source component from a config, save and restore the
vocab state after loading source pipelines, in particular to preserve
the original state without vectors, since `[initialize.vectors]
= null` skips rather than resets the vectors.

The vocab references are not synced for components loaded with
`Language.add_pipe(source=)` because the pipelines are already loaded
and not necessarily with the same vocab. A warning could be added in
`Language.create_pipe_from_source` that it may be necessary to save and
reload before training, but it's a rare enough case that this kind of
warning may be too noisy overall.
											
										
										
											2021-10-04 13:19:02 +03:00
+								        if self.vocab.strings != source.vocab.strings:
 								            for s in source.vocab.strings:
 								                self.vocab.strings.add(s)
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        return pipe, pipe_config["factory"]
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    def add_pipe(
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self,
 								        factory_name: str,
 								        name: Optional[str] = None,
 								        *,
 								        before: Optional[Union[str, int]] = None,
 								        after: Optional[Union[str, int]] = None,
 								        first: Optional[bool] = None,
 								        last: Optional[bool] = None,
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        source: Optional["Language"] = None,
-												config is not Optional (#9024)


											
										
										
											2021-08-27 12:44:31 +03:00
+								        config: Dict[str, Any] = SimpleFrozenDict(),
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        raw_config: Optional[Config] = None,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        validate: bool = True,
 								    ) -> Callable[[Doc], Doc]:
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """Add a component to the processing pipeline. Valid components are
-												Tidy up language, lemmatizer and scorer

											
										
										
											2017-10-27 15:40:14 +03:00
+								        callables that take a `Doc` object, modify it and return it. Only one
 								        of before/after/first/last can be set. Default behaviour is "last".
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        factory_name (str): Name of the component factory.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of pipeline component. Overwrites existing
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								            component.name attribute if available. If no name is set and
 								            the component exposes no name attribute, component.__name__ is
-												Tidy up language, lemmatizer and scorer

											
										
										
											2017-10-27 15:40:14 +03:00
+								            used. An error is raised if a name already exists in the pipeline.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        before (Union[str, int]): Name or index of the component to insert new
 								            component directly before.
 								        after (Union[str, int]): Name or index of the component to insert new
 								            component directly after.
 								        first (bool): If True, insert component first in the pipeline.
 								        last (bool): If True, insert component last in the pipeline.
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        source (Language): Optional loaded nlp object to copy the pipeline
 								            component from.
-												config is not Optional (#9024)


											
										
										
											2021-08-27 12:44:31 +03:00
+								        config (Dict[str, Any]): Config parameters to use for this component.
 								            Will be merged with default config, if available.
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        raw_config (Optional[Config]): Internals: the non-interpolated config.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        validate (bool): Whether to validate the component config against the
 								            arguments and types expected by the factory.
 								        RETURNS (Callable[[Doc], Doc]): The pipeline component.
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#add_pipe
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        if not isinstance(factory_name, str):
 								            bad_val = repr(factory_name)
 								            err = Errors.E966.format(component=bad_val, name=name)
 								            raise ValueError(err)
 								        name = name if name is not None else factory_name
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        if name in self.component_names:
 								            raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        if source is not None:
 								            # We're loading the component from a model. After loading the
 								            # component, we know its real factory name
 								            pipe_component, factory_name = self.create_pipe_from_source(
 								                factory_name, source, name=name
 								            )
 								        else:
 								            if not self.has_factory(factory_name):
 								                err = Errors.E002.format(
 								                    name=factory_name,
 								                    opts=", ".join(self.factory_names),
 								                    method="add_pipe",
 								                    lang=util.get_object_name(self),
 								                    lang_code=self.lang,
 								                )
 								            pipe_component = self.create_pipe(
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								                factory_name,
 								                name=name,
 								                config=config,
 								                raw_config=raw_config,
 								                validate=validate,
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								            )
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        pipe_index = self._get_pipe_index(before, after, first, last)
 								        self._pipe_meta[name] = self.get_factory_meta(factory_name)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self._components.insert(pipe_index, (name, pipe_component))
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        return pipe_component
-												Add lookup properties for components in Language

											
										
										
											2017-06-04 23:52:09 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def _get_pipe_index(
 								        self,
 								        before: Optional[Union[str, int]] = None,
 								        after: Optional[Union[str, int]] = None,
 								        first: Optional[bool] = None,
 								        last: Optional[bool] = None,
 								    ) -> int:
 								        """Determine where to insert a pipeline component based on the before/
 								        after/first/last values.
 								        before (str): Name or index of the component to insert directly before.
 								        after (str): Name or index of component to insert directly after.
 								        first (bool): If True, insert component first in the pipeline.
 								        last (bool): If True, insert component last in the pipeline.
 								        RETURNS (int): The index of the new pipeline component.
 								        """
 								        all_args = {"before": before, "after": after, "first": first, "last": last}
 								        if sum(arg is not None for arg in [before, after, first, last]) >= 2:
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								            raise ValueError(
 								                Errors.E006.format(args=all_args, opts=self.component_names)
 								            )
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        if last or not any(value is not None for value in [first, before, after]):
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								            return len(self._components)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        elif first:
 								            return 0
 								        elif isinstance(before, str):
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								            if before not in self.component_names:
 								                raise ValueError(
 								                    Errors.E001.format(name=before, opts=self.component_names)
 								                )
 								            return self.component_names.index(before)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        elif isinstance(after, str):
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								            if after not in self.component_names:
 								                raise ValueError(
 								                    Errors.E001.format(name=after, opts=self.component_names)
 								                )
 								            return self.component_names.index(after) + 1
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        # We're only accepting indices referring to components that exist
 								        # (can't just do isinstance here because bools are instance of int, too)
 								        elif type(before) == int:
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								            if before >= len(self._components) or before < 0:
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								                err = Errors.E959.format(
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								                    dir="before", idx=before, opts=self.component_names
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								                )
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								                raise ValueError(err)
 								            return before
 								        elif type(after) == int:
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								            if after >= len(self._components) or after < 0:
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								                err = Errors.E959.format(
 								                    dir="after", idx=after, opts=self.component_names
 								                )
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								                raise ValueError(err)
 								            return after + 1
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names))
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
 								    def has_pipe(self, name: str) -> bool:
-												Add Language.has_pipe method

											
										
										
											2017-10-17 12:20:07 +03:00
+								        """Check if a component name is present in the pipeline. Equivalent to
 								        `name in nlp.pipe_names`.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of the component.
-												Tidy up language, lemmatizer and scorer

											
										
										
											2017-10-27 15:40:14 +03:00
+								        RETURNS (bool): Whether a component of the name exists in the pipeline.
-												Tidy up docstrings

											
										
										
											2019-03-15 18:23:17 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#has_pipe
-												Add Language.has_pipe method

											
										
										
											2017-10-17 12:20:07 +03:00
+								        """
 								        return name in self.pipe_names
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def replace_pipe(
 								        self,
 								        name: str,
 								        factory_name: str,
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        *,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        config: Dict[str, Any] = SimpleFrozenDict(),
 								        validate: bool = True,
-												set_kb method for entity_linker

											
										
										
											2020-10-08 11:34:01 +03:00
+								    ) -> Callable[[Doc], Doc]:
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """Replace a component in the pipeline.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of the component to replace.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        factory_name (str): Factory name of replacement component.
 								        config (Optional[Dict[str, Any]]): Config parameters to use for this
 								            component. Will be merged with default config, if available.
 								        validate (bool): Whether to validate the component config against the
 								            arguments and types expected by the factory.
-												set_kb method for entity_linker

											
										
										
											2020-10-08 11:34:01 +03:00
+								        RETURNS (Callable[[Doc], Doc]): The new pipeline component.
-												Tidy up docstrings

											
										
										
											2019-03-15 18:23:17 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#replace_pipe
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """
-												Update Language.replace_pipe for disabled components (#8729)

* Fix the index where the replacement in inserted to account for
disabled components
* Allow `Language.replace_pipe` to replace disabled components
											
										
										
											2021-07-19 11:06:12 +03:00
+								        if name not in self.component_names:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        if hasattr(factory_name, "__call__"):
 								            err = Errors.E968.format(component=repr(factory_name), name=name)
 								            raise ValueError(err)
 								        # We need to delegate to Language.add_pipe here instead of just writing
 								        # to Language.pipeline to make sure the configs are handled correctly
-												Update Language.replace_pipe for disabled components (#8729)

* Fix the index where the replacement in inserted to account for
disabled components
* Allow `Language.replace_pipe` to replace disabled components
											
										
										
											2021-07-19 11:06:12 +03:00
+								        pipe_index = self.component_names.index(name)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self.remove_pipe(name)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        if not len(self._components) or pipe_index == len(self._components):
-												Bugfix in nlp.replace_pipe (#5875)

* bugfix and unit test

* merge two conditions
											
										
										
											2020-08-05 10:30:58 +03:00
+								            # we have no components to insert before/after, or we're replacing the last component
-												set_kb method for entity_linker

											
										
										
											2020-10-08 11:34:01 +03:00
+								            return self.add_pipe(
 								                factory_name, name=name, config=config, validate=validate
 								            )
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        else:
-												set_kb method for entity_linker

											
										
										
											2020-10-08 11:34:01 +03:00
+								            return self.add_pipe(
-												Format

											
										
										
											2020-08-23 22:15:12 +03:00
+								                factory_name,
 								                name=name,
 								                before=pipe_index,
 								                config=config,
 								                validate=validate,
 								            )
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def rename_pipe(self, old_name: str, new_name: str) -> None:
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """Rename a pipeline component.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        old_name (str): Name of the component to rename.
 								        new_name (str): New name of the component.
-												Tidy up docstrings

											
										
										
											2019-03-15 18:23:17 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#rename_pipe
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        if old_name not in self.component_names:
 								            raise ValueError(
 								                Errors.E001.format(name=old_name, opts=self.component_names)
 								            )
 								        if new_name in self.component_names:
 								            raise ValueError(
 								                Errors.E007.format(name=new_name, opts=self.component_names)
 								            )
 								        i = self.component_names.index(old_name)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self._components[i] = (new_name, self._components[i][1])
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
 								        self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
-												Adjust [initialize.components] on Language.remove_pipe and Language.rename_pipe

											
										
										
											2020-10-04 15:43:45 +03:00
+								        # Make sure [initialize] config is adjusted
 								        if old_name in self._config["initialize"]["components"]:
 								            init_cfg = self._config["initialize"]["components"].pop(old_name)
 								            self._config["initialize"]["components"][new_name] = init_cfg
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def remove_pipe(self, name: str) -> Tuple[str, Callable[[Doc], Doc]]:
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """Remove a component from the pipeline.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of the component to remove.
-												Update docstrings

											
										
										
											2017-10-07 02:04:50 +03:00
+								        RETURNS (tuple): A `(name, component)` tuple of the removed component.
-												Tidy up docstrings

											
										
										
											2019-03-15 18:23:17 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#remove_pipe
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        """
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        if name not in self.component_names:
 								            raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        removed = self._components.pop(self.component_names.index(name))
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        # We're only removing the component itself from the metas/configs here
 								        # because factory may be used for something else
 								        self._pipe_meta.pop(name)
 								        self._pipe_configs.pop(name)
-												Fix vectors check for sourced components (#8559)

* Fix vectors check for sourced components

Since vectors are not loaded when components are sourced, store a hash
for the vectors of each sourced component and compare it to the loaded
vectors after the vectors are loaded from the `[initialize]` block.

* Pop temporary info

* Remove stored hash in remove_pipe

* Add default for pop

* Add additional convert/debug/assemble CLI tests
											
										
										
											2021-07-06 13:43:17 +03:00
+								        self.meta.get("_sourced_vectors_hashes", {}).pop(name, None)
-												Adjust [initialize.components] on Language.remove_pipe and Language.rename_pipe

											
										
										
											2020-10-04 15:43:45 +03:00
+								        # Make sure name is removed from the [initialize] config
 								        if name in self._config["initialize"]["components"]:
 								            self._config["initialize"]["components"].pop(name)
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        # Make sure the name is also removed from the set of disabled components
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        if name in self.disabled:
-												Make user-facing Language.disabled return list

More consistent with all the other properties

											
										
										
											2020-08-29 13:08:33 +03:00
+								            self._disabled.remove(name)
-												Fix pipeline analysis on remove pipe (#4557)

Validate *after* component is removed, not before
											
										
										
											2019-10-30 21:04:17 +03:00
+								        return removed
-												Add lookup properties for components in Language

											
										
										
											2017-06-04 23:52:09 +03:00
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    def disable_pipe(self, name: str) -> None:
 								        """Disable a pipeline component. The component will still exist on
-												Remove todos and update docstrings

											
										
										
											2020-08-28 21:34:46 +03:00
+								        the nlp object, but it won't be run as part of the pipeline. Does
 								        nothing if the component is already disabled.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
 								        name (str): The name of the component to disable.
 								        """
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        if name not in self.component_names:
 								            raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
-												Make user-facing Language.disabled return list

More consistent with all the other properties

											
										
										
											2020-08-29 13:08:33 +03:00
+								        self._disabled.add(name)
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
 								    def enable_pipe(self, name: str) -> None:
 								        """Enable a previously disabled pipeline component so it's run as part
-												Remove todos and update docstrings

											
										
										
											2020-08-28 21:34:46 +03:00
+								        of the pipeline. Does nothing if the component is already enabled.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
 								        name (str): The name of the component to enable.
 								        """
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								        if name not in self.component_names:
 								            raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
 								        if name in self.disabled:
-												Make user-facing Language.disabled return list

More consistent with all the other properties

											
										
										
											2020-08-29 13:08:33 +03:00
+								            self._disabled.remove(name)
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def __call__(
 								        self,
 								        text: str,
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        *,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        disable: Iterable[str] = SimpleFrozenList(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
 								    ) -> Doc:
-												Fix formatting, tidy up and remove unused imports

											
										
										
											2017-10-07 01:26:05 +03:00
+								        """Apply the pipeline to some text. The text can span multiple sentences,
-												fix: use actual range in 'seen' instead of subtree

											
										
										
											2020-05-21 00:06:39 +03:00
+								        and can contain arbitrary whitespace. Alignment into the original string
-												* Add language base class

											
										
										
											2015-08-25 16:37:17 +03:00
+								        is preserved.
-												Remove trailing whitespace

											
										
										
											2016-12-18 18:54:52 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        text (str): The text to be processed.
-												Use disable argument (list) for serialization

											
										
										
											2017-05-26 13:33:54 +03:00
+								        disable (list): Names of the pipeline components to disable.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        component_cfg (Dict[str, dict]): An optional dictionary with extra
 								            keyword arguments for specific components.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        RETURNS (Doc): A container for accessing the annotations.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#call
-												* Add language base class

											
										
										
											2015-08-25 16:37:17 +03:00
+								        """
-												Break the tokenization stage out of the pipeline into a function 'make_doc'. This allows all pipeline methods to have the same signature.

											
										
										
											2016-10-14 18:38:29 +03:00
+								        doc = self.make_doc(text)
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
+								        if component_cfg is None:
 								            component_cfg = {}
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        for name, proc in self.pipeline:
-												Use disable argument (list) for serialization

											
										
										
											2017-05-26 13:33:54 +03:00
+								            if name in disable:
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
+								                continue
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            if not hasattr(proc, "__call__"):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E003.format(component=type(proc), name=name))
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								            error_handler = self.default_error_handler
 								            if hasattr(proc, "get_error_handler"):
 								                error_handler = proc.get_error_handler()
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 20:42:27 +03:00
+								            try:
 								                doc = proc(doc, **component_cfg.get(name, {}))
-												print debugging warning before raising error if model not properly initialized

											
										
										
											2020-10-01 10:21:00 +03:00
+								            except KeyError as e:
-												Raise error from caught KeyError to preserve traceback

											
										
										
											2020-10-03 12:43:56 +03:00
+								                # This typically happens if a component is not initialized
 								                raise ValueError(Errors.E109.format(name=name)) from e
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								            except Exception as e:
 								                error_handler(name, proc, [doc], e)
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            if doc is None:
 								                raise ValueError(Errors.E005.format(name=name))
-												* Change Language class to use a .pipeline attribute, instead of having the pipeline hard coded

											
										
										
											2016-05-17 17:55:42 +03:00
+								        return doc
-												* Add language base class

											
										
										
											2015-08-25 16:37:17 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def disable_pipes(self, *names) -> "DisabledPipes":
-												Tidy up language, lemmatizer and scorer

											
										
										
											2017-10-27 15:40:14 +03:00
+								        """Disable one or more pipeline components. If used as a context
 								        manager, the pipeline will be restored to the initial state at the end
 								        of the block. Otherwise, a DisabledPipes object is returned, that has
 								        a `.restore()` method you can use to undo your changes.
-												Add Language.disable_pipes()

											
										
										
											2017-10-25 14:46:41 +03:00
-												Feature toggle_pipes (#5378)

* make disable_pipes deprecated in favour of the new toggle_pipes

* rewrite disable_pipes statements

* update documentation

* remove bin/wiki_entity_linking folder

* one more fix

* remove deprecated link to documentation

* few more doc fixes

* add note about name change to the docs

* restore original disable_pipes

* small fixes

* fix typo

* fix error number to W096

* rename to select_pipes

* also make changes to the documentation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-18 23:27:10 +03:00
+								        This method has been deprecated since 3.0
-												Tidy up language, lemmatizer and scorer

											
										
										
											2017-10-27 15:40:14 +03:00
+								        """
-												Feature toggle_pipes (#5378)

* make disable_pipes deprecated in favour of the new toggle_pipes

* rewrite disable_pipes statements

* update documentation

* remove bin/wiki_entity_linking folder

* one more fix

* remove deprecated link to documentation

* few more doc fixes

* add note about name change to the docs

* restore original disable_pipes

* small fixes

* fix typo

* fix error number to W096

* rename to select_pipes

* also make changes to the documentation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-18 23:27:10 +03:00
+								        warnings.warn(Warnings.W096, DeprecationWarning)
-												Also support passing list to Language.disable_pipes (#4521)

* Also support passing list to Language.disable_pipes

* Adjust internals

											
										
										
											2019-10-25 17:19:08 +03:00
+								        if len(names) == 1 and isinstance(names[0], (list, tuple)):
 								            names = names[0]  # support list of names instead of spread
-												Make deprecated disable_pipes call into select_pipes

											
										
										
											2020-08-29 13:08:46 +03:00
+								        return self.select_pipes(disable=names)
-												Feature toggle_pipes (#5378)

* make disable_pipes deprecated in favour of the new toggle_pipes

* rewrite disable_pipes statements

* update documentation

* remove bin/wiki_entity_linking folder

* one more fix

* remove deprecated link to documentation

* few more doc fixes

* add note about name change to the docs

* restore original disable_pipes

* small fixes

* fix typo

* fix error number to W096

* rename to select_pipes

* also make changes to the documentation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-18 23:27:10 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def select_pipes(
 								        self,
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        *,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        disable: Optional[Union[str, Iterable[str]]] = None,
 								        enable: Optional[Union[str, Iterable[str]]] = None,
 								    ) -> "DisabledPipes":
-												Feature toggle_pipes (#5378)

* make disable_pipes deprecated in favour of the new toggle_pipes

* rewrite disable_pipes statements

* update documentation

* remove bin/wiki_entity_linking folder

* one more fix

* remove deprecated link to documentation

* few more doc fixes

* add note about name change to the docs

* restore original disable_pipes

* small fixes

* fix typo

* fix error number to W096

* rename to select_pipes

* also make changes to the documentation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-18 23:27:10 +03:00
+								        """Disable one or more pipeline components. If used as a context
 								        manager, the pipeline will be restored to the initial state at the end
 								        of the block. Otherwise, a DisabledPipes object is returned, that has
 								        a `.restore()` method you can use to undo your changes.
 								        disable (str or iterable): The name(s) of the pipes to disable
 								        enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#select_pipes
-												Feature toggle_pipes (#5378)

* make disable_pipes deprecated in favour of the new toggle_pipes

* rewrite disable_pipes statements

* update documentation

* remove bin/wiki_entity_linking folder

* one more fix

* remove deprecated link to documentation

* few more doc fixes

* add note about name change to the docs

* restore original disable_pipes

* small fixes

* fix typo

* fix error number to W096

* rename to select_pipes

* also make changes to the documentation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-18 23:27:10 +03:00
+								        """
 								        if enable is None and disable is None:
 								            raise ValueError(Errors.E991)
 								        if disable is not None and isinstance(disable, str):
 								            disable = [disable]
 								        if enable is not None:
 								            if isinstance(enable, str):
 								                enable = [enable]
 								            to_disable = [pipe for pipe in self.pipe_names if pipe not in enable]
 								            # raise an error if the enable and disable keywords are not consistent
 								            if disable is not None and disable != to_disable:
-												default models defined in component decorator (#5452)

* move defaults to pipeline and use in component decorator

* black formatting

* relative import
											
										
										
											2020-05-19 17:20:03 +03:00
+								                raise ValueError(
 								                    Errors.E992.format(
 								                        enable=enable, disable=disable, names=self.pipe_names
 								                    )
 								                )
-												Feature toggle_pipes (#5378)

* make disable_pipes deprecated in favour of the new toggle_pipes

* rewrite disable_pipes statements

* update documentation

* remove bin/wiki_entity_linking folder

* one more fix

* remove deprecated link to documentation

* few more doc fixes

* add note about name change to the docs

* restore original disable_pipes

* small fixes

* fix typo

* fix error number to W096

* rename to select_pipes

* also make changes to the documentation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-18 23:27:10 +03:00
+								            disable = to_disable
-												bugfix DisabledPipes

											
										
										
											2020-10-09 13:06:20 +03:00
+								        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
 								        # those pipes that were already disabled.
 								        disable = [d for d in disable if d not in self._disabled]
-												Feature toggle_pipes (#5378)

* make disable_pipes deprecated in favour of the new toggle_pipes

* rewrite disable_pipes statements

* update documentation

* remove bin/wiki_entity_linking folder

* one more fix

* remove deprecated link to documentation

* few more doc fixes

* add note about name change to the docs

* restore original disable_pipes

* small fixes

* fix typo

* fix error number to W096

* rename to select_pipes

* also make changes to the documentation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-18 23:27:10 +03:00
+								        return DisabledPipes(self, disable)
-												Add Language.disable_pipes()

											
										
										
											2017-10-25 14:46:41 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def make_doc(self, text: str) -> Doc:
 								        """Turn a text into a Doc object.
 								        text (str): The text to process.
 								        RETURNS (Doc): The processed doc.
 								        """
-												Move max_length to nlp.make_doc() (#6512)

Move max_length check to `nlp.make_doc()` so that's it's also checked
for `nlp.pipe()`.
											
										
										
											2020-12-08 09:24:02 +03:00
+								        if len(text) > self.max_length:
 								            raise ValueError(
 								                Errors.E088.format(length=len(text), max_length=self.max_length)
 								            )
-												Work on serialization

											
										
										
											2017-05-29 16:40:45 +03:00
+								        return self.tokenizer(text)
-												Merge branch 'develop' into master-tmp

											
										
										
											2020-05-21 19:39:06 +03:00
+								    def update(
 								        self,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        examples: Iterable[Example],
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        _: Optional[Any] = None,
-												Merge branch 'develop' into master-tmp

											
										
										
											2020-05-21 19:39:06 +03:00
+								        *,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        drop: float = 0.0,
 								        sgd: Optional[Optimizer] = None,
 								        losses: Optional[Dict[str, float]] = None,
 								        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        exclude: Iterable[str] = SimpleFrozenList(),
-												Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

Add a `[training]` option called `set_annotations_on_update` to specify
a list of components for which the predicted annotations should be set
on `example.predicted` immediately after that component has been
updated. The predicted annotations can be accessed by later components
in the pipeline during the processing of the batch in the same `update`
call.

* Rename to annotates / annotating_components

* Add test for `annotating_components` when training from config

* Add documentation
											
										
										
											2021-04-26 17:53:53 +03:00
+								        annotates: Iterable[str] = SimpleFrozenList(),
-												Merge branch 'develop' into master-tmp

											
										
										
											2020-05-21 19:39:06 +03:00
+								    ):
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        """Update the models in the pipeline.
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        examples (Iterable[Example]): A batch of examples
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        _: Should not be set - serves to catch backwards-incompatible scripts.
-												KB extensions and better parsing of WikiData (#4375)

* fix overflow error on windows

* more documentation & logging fixes

* md fix

* 3 different limit parameters to play with execution time

* bug fixes directory locations

* small fixes

* exclude dev test articles from prior probabilities stats

* small fixes

* filtering wikidata entities, removing numeric and meta items

* adding aliases from wikidata also to the KB

* fix adding WD aliases

* adding also new aliases to previously added entities

* fixing comma's

* small doc fixes

* adding subclassof filtering

* append alias functionality in KB

* prevent appending the same entity-alias pair

* fix for appending WD aliases

* remove date filter

* remove unnecessary import

* small corrections and reformatting

* remove WD aliases for now (too slow)

* removing numeric entities from training and evaluation

* small fixes

* shortcut during prediction if there is only one candidate

* add counts and fscore logging, remove FP NER from evaluation

* fix entity_linker.predict to take docs instead of single sentences

* remove enumeration sentences from the WP dataset

* entity_linker.update to process full doc instead of single sentence

* spelling corrections and dump locations in readme

* NLP IO fix

* reading KB is unnecessary at the end of the pipeline

* small logging fix

* remove empty files

											
										
										
											2019-10-14 13:28:53 +03:00
+								        drop (float): The dropout rate.
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        sgd (Optimizer): An optimizer.
-												Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

Add a `[training]` option called `set_annotations_on_update` to specify
a list of components for which the predicted annotations should be set
on `example.predicted` immediately after that component has been
updated. The predicted annotations can be accessed by later components
in the pipeline during the processing of the batch in the same `update`
call.

* Rename to annotates / annotating_components

* Add test for `annotating_components` when training from config

* Add documentation
											
										
										
											2021-04-26 17:53:53 +03:00
+								        losses (Dict[str, float]): Dictionary to update with the loss, keyed by
 								            component.
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
-												Update Language.update docs

											
										
										
											2019-05-24 15:06:26 +03:00
+								            components, keyed by component name.
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        exclude (Iterable[str]): Names of components that shouldn't be updated.
-												Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

Add a `[training]` option called `set_annotations_on_update` to specify
a list of components for which the predicted annotations should be set
on `example.predicted` immediately after that component has been
updated. The predicted annotations can be accessed by later components
in the pipeline during the processing of the batch in the same `update`
call.

* Rename to annotates / annotating_components

* Add test for `annotating_components` when training from config

* Add documentation
											
										
										
											2021-04-26 17:53:53 +03:00
+								        annotates (Iterable[str]): Names of components that should set
 								            annotations on the predicted examples after updating.
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        RETURNS (Dict[str, float]): The updated losses dictionary
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#update
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        """
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        if _ is not None:
-												Various fixes to NEL functionality, Example class etc (#5460)

* setting KB in the EL constructor, similar to how the model is passed on

* removing wikipedia example files - moved to projects

* throw an error when nlp.update is called with 2 positional arguments

* rewriting the config logic in create pipe to accomodate for other objects (e.g. KB) in the config

* update config files with new parameters

* avoid training pipeline components that don't have a model (like sentencizer)

* various small fixes + UX improvements

* small fixes

* set thinc to 8.0.0a9 everywhere

* remove outdated comment
											
										
										
											2020-05-20 12:41:12 +03:00
+								            raise ValueError(Errors.E989)
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        if losses is None:
 								            losses = {}
-												Example class for training data (#4543)

* OrigAnnot class instead of gold.orig_annot list of zipped tuples

* from_orig to replace from_annot_tuples

* rename to RawAnnot

* some unit tests for GoldParse creation and internal format

* removing orig_annot and switching to lists instead of tuple

* rewriting tuples to use RawAnnot (+ debug statements, WIP)

* fix pop() changing the data

* small fixes

* pop-append fixes

* return RawAnnot for existing GoldParse to have uniform interface

* clean up imports

* fix merge_sents

* add unit test for 4402 with new structure (not working yet)

* introduce DocAnnot

* typo fixes

* add unit test for merge_sents

* rename from_orig to from_raw

* fixing unit tests

* fix nn parser

* read_annots to produce text, doc_annot pairs

* _make_golds fix

* rename golds_to_gold_annots

* small fixes

* fix encoding

* have golds_to_gold_annots use DocAnnot

* missed a spot

* merge_sents as function in DocAnnot

* allow specifying only part of the token-level annotations

* refactor with Example class + underlying dicts

* pipeline components to work with Example objects (wip)

* input checking

* fix yielding

* fix calls to update

* small fixes

* fix scorer unit test with new format

* fix kwargs order

* fixes for ud and conllu scripts

* fix reading data for conllu script

* add in proper errors (not fixed numbering yet to avoid merge conflicts)

* fixing few more small bugs

* fix EL script

											
										
										
											2019-11-11 19:35:27 +03:00
+								        if len(examples) == 0:
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								            return losses
-												Tidy up pipes (#5906)

* Tidy up pipes

* Fix init, defaults and raise custom errors

* Update docs

* Update docs [ci skip]

* Apply suggestions from code review

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Tidy up error handling and validation, fix consistency

* Simplify get_examples check

* Remove unused import [ci skip]

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-12 00:29:31 +03:00
+								        validate_examples(examples, "Language.update")
-												Copy the Example objects (and their predicted Doc) in nlp.evaluate() and nlp.update() (#6765)

* Make copy of examples in nlp.update and nlp.evaluate

* Avoid circular import

* Fix evaluate
											
										
										
											2021-01-19 18:47:44 +03:00
+								        examples = _copy_examples(examples)
-												Add optimizer in Language.update if sgd=None

											
										
										
											2017-08-20 15:42:07 +03:00
+								        if sgd is None:
 								            if self._optimizer is None:
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								                self._optimizer = self.create_optimizer()
-												Add optimizer in Language.update if sgd=None

											
										
										
											2017-08-20 15:42:07 +03:00
+								            sgd = self._optimizer
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
+								        if component_cfg is None:
 								            component_cfg = {}
-												Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

Add a `[training]` option called `set_annotations_on_update` to specify
a list of components for which the predicted annotations should be set
on `example.predicted` immediately after that component has been
updated. The predicted annotations can be accessed by later components
in the pipeline during the processing of the batch in the same `update`
call.

* Rename to annotates / annotating_components

* Add test for `annotating_components` when training from config

* Add documentation
											
										
										
											2021-04-26 17:53:53 +03:00
+								        pipe_kwargs = {}
-												Guess set_annotations=True in nlp.update

During `nlp.update`, components can be passed a boolean set_annotations
to indicate whether they should assign annotations to the `Doc`. This
needs to be called if downstream components expect to use the
annotations during training, e.g. if we wanted to use tagger features in
the parser.

Components can specify their assignments and requirements, so we can
figure out which components have these inter-dependencies. After
figuring this out, we can guess whether to pass set_annotations=True.

We could also call set_annotations=True always, or even just have this
as the only behaviour. The downside of this is that it would require the
`Doc` objects to be created afresh to avoid problematic modifications.
One approach would be to make a fresh copy of the `Doc` objects within
`nlp.update()`, so that we can write to the objects without any
problems. If we do that, we can drop this logic and also drop the
`set_annotations` mechanism. I would be fine with that approach,
although it runs the risk of introducing some performance overhead, and
we'll have to take care to copy all extension attributes etc.

											
										
										
											2020-05-22 16:55:45 +03:00
+								        for i, (name, proc) in enumerate(self.pipeline):
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								            component_cfg.setdefault(name, {})
-												Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

Add a `[training]` option called `set_annotations_on_update` to specify
a list of components for which the predicted annotations should be set
on `example.predicted` immediately after that component has been
updated. The predicted annotations can be accessed by later components
in the pipeline during the processing of the batch in the same `update`
call.

* Rename to annotates / annotating_components

* Add test for `annotating_components` when training from config

* Add documentation
											
										
										
											2021-04-26 17:53:53 +03:00
+								            pipe_kwargs[name] = deepcopy(component_cfg[name])
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								            component_cfg[name].setdefault("drop", drop)
-												Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

Add a `[training]` option called `set_annotations_on_update` to specify
a list of components for which the predicted annotations should be set
on `example.predicted` immediately after that component has been
updated. The predicted annotations can be accessed by later components
in the pipeline during the processing of the batch in the same `update`
call.

* Rename to annotates / annotating_components

* Add test for `annotating_components` when training from config

* Add documentation
											
										
										
											2021-04-26 17:53:53 +03:00
+								            pipe_kwargs[name].setdefault("batch_size", self.batch_size)
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								        for name, proc in self.pipeline:
-												Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

Add a `[training]` option called `set_annotations_on_update` to specify
a list of components for which the predicted annotations should be set
on `example.predicted` immediately after that component has been
updated. The predicted annotations can be accessed by later components
in the pipeline during the processing of the batch in the same `update`
call.

* Rename to annotates / annotating_components

* Add test for `annotating_components` when training from config

* Add documentation
											
										
										
											2021-04-26 17:53:53 +03:00
+								            if name not in exclude and hasattr(proc, "update"):
 								                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
 								            if sgd not in (None, False):
-												Tidy up pipes (#5906)

* Tidy up pipes

* Fix init, defaults and raise custom errors

* Update docs

* Update docs [ci skip]

* Apply suggestions from code review

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Tidy up error handling and validation, fix consistency

* Simplify get_examples check

* Remove unused import [ci skip]

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-12 00:29:31 +03:00
+								                if (
 								                    name not in exclude
-												is_trainable method

											
										
										
											2020-10-05 18:43:42 +03:00
+								                    and hasattr(proc, "is_trainable")
-												TrainablePipe (#6213)

* rename Pipe to TrainablePipe

* split functionality between Pipe and TrainablePipe

* remove unnecessary methods from certain components

* cleanup

* hasattr(component, "pipe") should be sufficient again

* remove serialization and vocab/cfg from Pipe

* unify _ensure_examples and validate_examples

* small fixes

* hasattr checks for self.cfg and self.vocab

* make is_resizable and is_trainable properties

* serialize strings.json instead of vocab

* fix KB IO + tests

* fix typos

* more typos

* _added_strings as a set

* few more tests specifically for _added_strings field

* bump to 3.0.0a36
											
										
										
											2020-10-08 22:33:49 +03:00
+								                    and proc.is_trainable
-												Tidy up pipes (#5906)

* Tidy up pipes

* Fix init, defaults and raise custom errors

* Update docs

* Update docs [ci skip]

* Apply suggestions from code review

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Tidy up error handling and validation, fix consistency

* Simplify get_examples check

* Remove unused import [ci skip]

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-12 00:29:31 +03:00
+								                    and proc.model not in (True, False, None)
 								                ):
-												add finish_update to Pipe

											
										
										
											2020-10-05 17:23:33 +03:00
+								                    proc.finish_update(sgd)
-												Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

Add a `[training]` option called `set_annotations_on_update` to specify
a list of components for which the predicted annotations should be set
on `example.predicted` immediately after that component has been
updated. The predicted annotations can be accessed by later components
in the pipeline during the processing of the batch in the same `update`
call.

* Rename to annotates / annotating_components

* Add test for `annotating_components` when training from config

* Add documentation
											
										
										
											2021-04-26 17:53:53 +03:00
+								            if name in annotates:
 								                for doc, eg in zip(
 								                    _pipe(
 								                        (eg.predicted for eg in examples),
 								                        proc=proc,
 								                        name=name,
 								                        default_error_handler=self.default_error_handler,
 								                        kwargs=pipe_kwargs[name],
 								                    ),
 								                    examples,
 								                ):
 								                    eg.predicted = doc
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        return losses
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def rehearse(
 								        self,
 								        examples: Iterable[Example],
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        *,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        sgd: Optional[Optimizer] = None,
 								        losses: Optional[Dict[str, float]] = None,
 								        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        exclude: Iterable[str] = SimpleFrozenList(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    ) -> Dict[str, float]:
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        """Make a "rehearsal" update to the models in the pipeline, to prevent
 								        forgetting. Rehearsal updates run an initial copy of the model over some
 								        data, and update the model so its current predictions are more like the
-												Use consistent spelling

											
										
										
											2019-10-02 11:37:39 +03:00
+								        initial ones. This is useful for keeping a pretrained model on-track,
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        even if you're updating it with a smaller set of examples.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        examples (Iterable[Example]): A batch of `Example` objects.
 								        sgd (Optional[Optimizer]): An optimizer.
 								        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
 								            components, keyed by component name.
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        exclude (Iterable[str]): Names of components that shouldn't be updated.
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        RETURNS (dict): Results from the update.
 								        EXAMPLE:
 								            >>> raw_text_batches = minibatch(raw_texts)
-												Feature/example only (#5707)

* remove _convert_examples

* fix test_gold, raise TypeError if tuples are used instead of Example's

* throwing proper errors when the wrong type of objects are passed

* fix deprectated format in tests

* fix deprectated format in parser tests

* fix tests for NEL, morph, senter, tagger, textcat

* update regression tests with new Example format

* use make_doc

* more fixes to nlp.update calls

* few more small fixes for rehearse and evaluate

* only import ml_datasets if really necessary
											
										
										
											2020-07-06 14:02:36 +03:00
+								            >>> for labelled_batch in minibatch(examples):
-												Example class for training data (#4543)

* OrigAnnot class instead of gold.orig_annot list of zipped tuples

* from_orig to replace from_annot_tuples

* rename to RawAnnot

* some unit tests for GoldParse creation and internal format

* removing orig_annot and switching to lists instead of tuple

* rewriting tuples to use RawAnnot (+ debug statements, WIP)

* fix pop() changing the data

* small fixes

* pop-append fixes

* return RawAnnot for existing GoldParse to have uniform interface

* clean up imports

* fix merge_sents

* add unit test for 4402 with new structure (not working yet)

* introduce DocAnnot

* typo fixes

* add unit test for merge_sents

* rename from_orig to from_raw

* fixing unit tests

* fix nn parser

* read_annots to produce text, doc_annot pairs

* _make_golds fix

* rename golds_to_gold_annots

* small fixes

* fix encoding

* have golds_to_gold_annots use DocAnnot

* missed a spot

* merge_sents as function in DocAnnot

* allow specifying only part of the token-level annotations

* refactor with Example class + underlying dicts

* pipeline components to work with Example objects (wip)

* input checking

* fix yielding

* fix calls to update

* small fixes

* fix scorer unit test with new format

* fix kwargs order

* fixes for ud and conllu scripts

* fix reading data for conllu script

* add in proper errors (not fixed numbering yet to avoid merge conflicts)

* fixing few more small bugs

* fix EL script

											
										
										
											2019-11-11 19:35:27 +03:00
+								            >>>     nlp.update(labelled_batch)
-												Feature/example only (#5707)

* remove _convert_examples

* fix test_gold, raise TypeError if tuples are used instead of Example's

* throwing proper errors when the wrong type of objects are passed

* fix deprectated format in tests

* fix deprectated format in parser tests

* fix tests for NEL, morph, senter, tagger, textcat

* update regression tests with new Example format

* use make_doc

* more fixes to nlp.update calls

* few more small fixes for rehearse and evaluate

* only import ml_datasets if really necessary
											
										
										
											2020-07-06 14:02:36 +03:00
+								            >>>     raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								            >>>     nlp.rehearse(raw_batch)
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#rehearse
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        """
-												Example class for training data (#4543)

* OrigAnnot class instead of gold.orig_annot list of zipped tuples

* from_orig to replace from_annot_tuples

* rename to RawAnnot

* some unit tests for GoldParse creation and internal format

* removing orig_annot and switching to lists instead of tuple

* rewriting tuples to use RawAnnot (+ debug statements, WIP)

* fix pop() changing the data

* small fixes

* pop-append fixes

* return RawAnnot for existing GoldParse to have uniform interface

* clean up imports

* fix merge_sents

* add unit test for 4402 with new structure (not working yet)

* introduce DocAnnot

* typo fixes

* add unit test for merge_sents

* rename from_orig to from_raw

* fixing unit tests

* fix nn parser

* read_annots to produce text, doc_annot pairs

* _make_golds fix

* rename golds_to_gold_annots

* small fixes

* fix encoding

* have golds_to_gold_annots use DocAnnot

* missed a spot

* merge_sents as function in DocAnnot

* allow specifying only part of the token-level annotations

* refactor with Example class + underlying dicts

* pipeline components to work with Example objects (wip)

* input checking

* fix yielding

* fix calls to update

* small fixes

* fix scorer unit test with new format

* fix kwargs order

* fixes for ud and conllu scripts

* fix reading data for conllu script

* add in proper errors (not fixed numbering yet to avoid merge conflicts)

* fixing few more small bugs

* fix EL script

											
										
										
											2019-11-11 19:35:27 +03:00
+								        if len(examples) == 0:
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								            return
-												Tidy up pipes (#5906)

* Tidy up pipes

* Fix init, defaults and raise custom errors

* Update docs

* Update docs [ci skip]

* Apply suggestions from code review

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Tidy up error handling and validation, fix consistency

* Simplify get_examples check

* Remove unused import [ci skip]

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-12 00:29:31 +03:00
+								        validate_examples(examples, "Language.rehearse")
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        if sgd is None:
 								            if self._optimizer is None:
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								                self._optimizer = self.create_optimizer()
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								            sgd = self._optimizer
 								        pipes = list(self.pipeline)
 								        random.shuffle(pipes)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        if component_cfg is None:
 								            component_cfg = {}
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        grads = {}
 								        def get_grads(W, dW, key=None):
 								            grads[key] = (W, dW)
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								        get_grads.learn_rate = sgd.learn_rate
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        get_grads.b1 = sgd.b1
 								        get_grads.b2 = sgd.b2
 								        for name, proc in pipes:
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								            if name in exclude or not hasattr(proc, "rehearse"):
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								                continue
 								            grads = {}
-												Small fixes to as_example (#4957)

* label in span not writable anymore

* Revert "label in span not writable anymore"

This reverts commit ab442338c8c4ddd7dfbc15348f999b74f4928090.

* fixing yield - remove redundant list

											
										
										
											2020-02-03 15:02:12 +03:00
+								            proc.rehearse(
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								                examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
-												Small fixes to as_example (#4957)

* label in span not writable anymore

* Revert "label in span not writable anymore"

This reverts commit ab442338c8c4ddd7dfbc15348f999b74f4928090.

* fixing yield - remove redundant list

											
										
										
											2020-02-03 15:02:12 +03:00
+								            )
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								        for key, (W, dW) in grads.items():
 								            sgd(W, dW, key=key)
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        return losses
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def begin_training(
 								        self,
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
 								        *,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        sgd: Optional[Optimizer] = None,
-												begin_training -> initialize

											
										
										
											2020-09-28 22:35:09 +03:00
+								    ) -> Optimizer:
 								        warnings.warn(Warnings.W089, DeprecationWarning)
-												Clean up

											
										
										
											2020-09-29 13:14:08 +03:00
+								        return self.initialize(get_examples, sgd=sgd)
-												begin_training -> initialize

											
										
										
											2020-09-28 22:35:09 +03:00
 								    def initialize(
 								        self,
 								        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
 								        *,
 								        sgd: Optional[Optimizer] = None,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    ) -> Optimizer:
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        """Initialize the pipe for training, using data examples if available.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        get_examples (Callable[[], Iterable[Example]]): Optional function that
 								            returns gold-standard Example objects.
-												Clean up

											
										
										
											2020-09-29 13:14:08 +03:00
+								        sgd (Optional[Optimizer]): An optimizer to use for updates. If not
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								            provided, will be created using the .create_optimizer() method.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        RETURNS (thinc.api.Optimizer): The optimizer.
-												Tidy up docstrings

											
										
										
											2019-03-15 18:23:17 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#initialize
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        """
-												Example class for training data (#4543)

* OrigAnnot class instead of gold.orig_annot list of zipped tuples

* from_orig to replace from_annot_tuples

* rename to RawAnnot

* some unit tests for GoldParse creation and internal format

* removing orig_annot and switching to lists instead of tuple

* rewriting tuples to use RawAnnot (+ debug statements, WIP)

* fix pop() changing the data

* small fixes

* pop-append fixes

* return RawAnnot for existing GoldParse to have uniform interface

* clean up imports

* fix merge_sents

* add unit test for 4402 with new structure (not working yet)

* introduce DocAnnot

* typo fixes

* add unit test for merge_sents

* rename from_orig to from_raw

* fixing unit tests

* fix nn parser

* read_annots to produce text, doc_annot pairs

* _make_golds fix

* rename golds_to_gold_annots

* small fixes

* fix encoding

* have golds_to_gold_annots use DocAnnot

* missed a spot

* merge_sents as function in DocAnnot

* allow specifying only part of the token-level annotations

* refactor with Example class + underlying dicts

* pipeline components to work with Example objects (wip)

* input checking

* fix yielding

* fix calls to update

* small fixes

* fix scorer unit test with new format

* fix kwargs order

* fixes for ud and conllu scripts

* fix reading data for conllu script

* add in proper errors (not fixed numbering yet to avoid merge conflicts)

* fixing few more small bugs

* fix EL script

											
										
										
											2019-11-11 19:35:27 +03:00
+								        if get_examples is None:
-												Pipe API (#6034)

* ensure Language passes on valid examples for initialization

* fix tagger model initialization

* check for valid get_examples across components

* assume labels were added before begin_training

* fix senter initialization

* fix morphologizer initialization

* use methods to check arguments

* test textcat init, requires thinc>=8.0.0a31

* fix tok2vec init

* fix entity linker init

* use islice

* fix simple NER

* cleanup debug model

* fix assert statements

* fix tests

* throw error when adding a label if the output layer can't be resized anymore

* fix test

* add failing test for simple_ner

* UX improvements

* morphologizer UX

* assume begin_training gets a representative set and processes the labels

* remove assumptions for output of untrained NER model

* restore test for original purpose
											
										
										
											2020-09-08 23:44:25 +03:00
+								            util.logger.debug(
-												begin_training -> initialize

											
										
										
											2020-09-28 22:35:09 +03:00
+								                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
-												Pipe API (#6034)

* ensure Language passes on valid examples for initialization

* fix tagger model initialization

* check for valid get_examples across components

* assume labels were added before begin_training

* fix senter initialization

* fix morphologizer initialization

* use methods to check arguments

* test textcat init, requires thinc>=8.0.0a31

* fix tok2vec init

* fix entity linker init

* use islice

* fix simple NER

* cleanup debug model

* fix assert statements

* fix tests

* throw error when adding a label if the output layer can't be resized anymore

* fix test

* add failing test for simple_ner

* UX improvements

* morphologizer UX

* assume begin_training gets a representative set and processes the labels

* remove assumptions for output of untrained NER model

* restore test for original purpose
											
										
										
											2020-09-08 23:44:25 +03:00
+								            )
 								            doc = Doc(self.vocab, words=["x", "y", "z"])
 								            get_examples = lambda: [Example.from_dict(doc, {})]
 								        if not hasattr(get_examples, "__call__"):
-												Tidy up and auto-format

											
										
										
											2020-10-10 20:14:48 +03:00
+								            err = Errors.E930.format(
 								                method="Language.initialize", obj=type(get_examples)
 								            )
-												TrainablePipe (#6213)

* rename Pipe to TrainablePipe

* split functionality between Pipe and TrainablePipe

* remove unnecessary methods from certain components

* cleanup

* hasattr(component, "pipe") should be sufficient again

* remove serialization and vocab/cfg from Pipe

* unify _ensure_examples and validate_examples

* small fixes

* hasattr checks for self.cfg and self.vocab

* make is_resizable and is_trainable properties

* serialize strings.json instead of vocab

* fix KB IO + tests

* fix typos

* more typos

* _added_strings as a set

* few more tests specifically for _added_strings field

* bump to 3.0.0a36
											
										
										
											2020-10-08 22:33:49 +03:00
+								            raise TypeError(err)
-												Simplify config use in Language.initialize

											
										
										
											2020-09-29 17:05:48 +03:00
+								        # Make sure the config is interpolated so we can resolve subsections
 								        config = self.config.interpolate()
 								        # These are the settings provided in the [initialize] block in the config
 								        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-												Add initialize.before_init and after_init callbacks

Add `initialize.before_init` and `initialize.after_init` callbacks to
the config. The `initialize.before_init` callback is a place to
implement one-time tokenizer customizations that are then saved with the
model.

											
										
										
											2021-01-12 13:29:31 +03:00
+								        before_init = I["before_init"]
 								        if before_init is not None:
 								            before_init(self)
-												return custom error in nlp.initialize (#7104)

* return custom error in nlp.initialize

* Rename error

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2021-03-09 15:01:31 +03:00
+								        try:
 								            init_vocab(
 								                self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
 								            )
 								        except IOError:
 								            raise IOError(Errors.E884.format(vectors=I["vectors"]))
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								        if self.vocab.vectors.data.shape[1] >= 1:
 								            ops = get_current_ops()
 								            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-												Update Language.initialize and support components/tokenizer settings

											
										
										
											2020-09-29 12:52:45 +03:00
+								        if hasattr(self.tokenizer, "initialize"):
 								            tok_settings = validate_init_settings(
 								                self.tokenizer.initialize,
-												Simplify config use in Language.initialize

											
										
										
											2020-09-29 17:05:48 +03:00
+								                I["tokenizer"],
-												Update Language.initialize and support components/tokenizer settings

											
										
										
											2020-09-29 12:52:45 +03:00
+								                section="tokenizer",
 								                name="tokenizer",
 								            )
 								            self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        for name, proc in self.pipeline:
-												begin_training -> initialize

											
										
										
											2020-09-28 22:35:09 +03:00
+								            if hasattr(proc, "initialize"):
-												Simplify config use in Language.initialize

											
										
										
											2020-09-29 17:05:48 +03:00
+								                p_settings = I["components"].get(name, {})
-												Update Language.initialize and support components/tokenizer settings

											
										
										
											2020-09-29 12:52:45 +03:00
+								                p_settings = validate_init_settings(
 								                    proc.initialize, p_settings, section="components", name=name
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								                )
-												Pass nlp forward

											
										
										
											2020-09-29 13:21:52 +03:00
+								                proc.initialize(get_examples, nlp=self, **p_settings)
-												Fixing pretrain (#7342)

* initialize NLP with train corpus

* add more pretraining tests

* more tests

* function to fetch tok2vec layer for pretraining

* clarify parameter name

* test different objectives

* formatting

* fix check for static vectors when using vectors objective

* clarify docs

* logger statement

* fix init_tok2vec and proc.initialize order

* test training after pretraining

* add init_config tests for pretraining

* pop pretraining block to avoid config validation errors

* custom errors
											
										
										
											2021-03-09 06:01:13 +03:00
+								        pretrain_cfg = config.get("pretraining")
 								        if pretrain_cfg:
 								            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
 								            init_tok2vec(self, P, I)
-												remove link_components flag again (#6883)


											
										
										
											2021-02-02 05:08:40 +03:00
+								        self._link_components()
-												Simplify config use in Language.initialize

											
										
										
											2020-09-29 17:05:48 +03:00
+								        self._optimizer = sgd
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								        if sgd is not None:
 								            self._optimizer = sgd
 								        elif self._optimizer is None:
 								            self._optimizer = self.create_optimizer()
-												Add initialize.before_init and after_init callbacks

Add `initialize.before_init` and `initialize.after_init` callbacks to
the config. The `initialize.before_init` callback is a place to
implement one-time tokenizer customizations that are then saved with the
model.

											
										
										
											2021-01-12 13:29:31 +03:00
+								        after_init = I["after_init"]
 								        if after_init is not None:
 								            after_init(self)
-												Add optimizer in Language.update if sgd=None

											
										
										
											2017-08-20 15:42:07 +03:00
+								        return self._optimizer
-												Refactor training, to fix memory leak

											
										
										
											2017-05-21 17:07:06 +03:00
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								    def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
-												Use consistent spelling

											
										
										
											2019-10-02 11:37:39 +03:00
+								        """Continue training a pretrained model.
-												Merge branch 'master' into develop

											
										
										
											2018-12-18 15:48:10 +03:00
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        Create and return an optimizer, and initialize "rehearsal" for any pipeline
 								        component that has a .rehearse() method. Rehearsal is used to prevent
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        models from "forgetting" their initialized "knowledge". To perform
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        rehearsal, collect samples of text you want the models to retain performance
-												Feature/example only (#5707)

* remove _convert_examples

* fix test_gold, raise TypeError if tuples are used instead of Example's

* throwing proper errors when the wrong type of objects are passed

* fix deprectated format in tests

* fix deprectated format in parser tests

* fix tests for NEL, morph, senter, tagger, textcat

* update regression tests with new Example format

* use make_doc

* more fixes to nlp.update calls

* few more small fixes for rehearse and evaluate

* only import ml_datasets if really necessary
											
										
										
											2020-07-06 14:02:36 +03:00
+								        on, and call nlp.rehearse() with a batch of Example objects.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
 								        RETURNS (Optimizer): The optimizer.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#resume_training
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        """
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								        ops = get_current_ops()
 								        if self.vocab.vectors.data.shape[1] >= 1:
 								            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        for name, proc in self.pipeline:
 								            if hasattr(proc, "_rehearsal_model"):
 								                proc._rehearsal_model = deepcopy(proc.model)
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								        if sgd is not None:
 								            self._optimizer = sgd
 								        elif self._optimizer is None:
 								            self._optimizer = self.create_optimizer()
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-textt ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also gives us a good place to implement the the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        return self._optimizer
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								    def set_error_handler(
 								        self,
 								        error_handler: Callable[
 								            [str, Callable[[Doc], Doc], List[Doc], Exception], None
 								        ],
 								    ):
 								        """Set an error handler object for all the components in the pipeline that implement
 								        a set_error_handler function.
 								        error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]):
 								            Function that deals with a failing batch of documents. This callable function should take in
 								            the component's name, the component itself, the offending batch of documents, and the exception
 								            that was thrown.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#set_error_handler
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								        """
 								        self.default_error_handler = error_handler
 								        for name, pipe in self.pipeline:
 								            if hasattr(pipe, "set_error_handler"):
 								                pipe.set_error_handler(error_handler)
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
+								    def evaluate(
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        self,
 								        examples: Iterable[Example],
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        *,
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        batch_size: Optional[int] = None,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        scorer: Optional[Scorer] = None,
 								        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
-												Move Language.evaluate scorer config to new arg

Move `Language.evaluate` scorer config from `component_cfg` to separate
argument `scorer_cfg`.

											
										
										
											2020-07-31 12:02:17 +03:00
+								        scorer_cfg: Optional[Dict[str, Any]] = None,
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								    ) -> Dict[str, Union[float, dict]]:
-												Document Language.evaluate

											
										
										
											2019-05-24 15:06:36 +03:00
+								        """Evaluate a model's pipeline components.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        examples (Iterable[Example]): `Example` objects.
-												Update docstring for Language.evaluate

											
										
										
											2020-12-09 12:21:39 +03:00
+								        batch_size (Optional[int]): Batch size to use.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
-												Document Language.evaluate

											
										
										
											2019-05-24 15:06:36 +03:00
+								            will be created.
 								        component_cfg (dict): An optional dictionary with extra keyword
 								            arguments for specific components.
-												Move Language.evaluate scorer config to new arg

Move `Language.evaluate` scorer config from `component_cfg` to separate
argument `scorer_cfg`.

											
										
										
											2020-07-31 12:02:17 +03:00
+								        scorer_cfg (dict): An optional dictionary with extra keyword arguments
 								            for the scorer.
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
-												Document Language.evaluate

											
										
										
											2019-05-24 15:06:36 +03:00
+								        RETURNS (Scorer): The scorer containing the evaluation results.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#evaluate
-												Document Language.evaluate

											
										
										
											2019-05-24 15:06:36 +03:00
+								        """
-												Fix memory issues in Language.evaluate (#6386)

* Fix memory issues in Language.evaluate

Reset annotation in predicted docs before evaluating and store all data
in `examples`.

* Minor refactor to docs generator init

* Fix generator expression

* Fix final generator check

* Refactor pipeline loop

* Handle examples generator in Language.evaluate

* Add test with generator

* Use make_doc
											
										
										
											2020-12-31 02:45:50 +03:00
+								        examples = list(examples)
-												Tidy up pipes (#5906)

* Tidy up pipes

* Fix init, defaults and raise custom errors

* Update docs

* Update docs [ci skip]

* Apply suggestions from code review

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Tidy up error handling and validation, fix consistency

* Simplify get_examples check

* Remove unused import [ci skip]

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-12 00:29:31 +03:00
+								        validate_examples(examples, "Language.evaluate")
-												Copy the Example objects (and their predicted Doc) in nlp.evaluate() and nlp.update() (#6765)

* Make copy of examples in nlp.update and nlp.evaluate

* Avoid circular import

* Fix evaluate
											
										
										
											2021-01-19 18:47:44 +03:00
+								        examples = _copy_examples(examples)
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        if batch_size is None:
 								            batch_size = self.batch_size
-												Fix Language.evaluate

											
										
										
											2019-03-15 17:20:09 +03:00
+								        if component_cfg is None:
 								            component_cfg = {}
-												Move Language.evaluate scorer config to new arg

Move `Language.evaluate` scorer config from `component_cfg` to separate
argument `scorer_cfg`.

											
										
										
											2020-07-31 12:02:17 +03:00
+								        if scorer_cfg is None:
 								            scorer_cfg = {}
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								        if scorer is None:
-												Move Language.evaluate scorer config to new arg

Move `Language.evaluate` scorer config from `component_cfg` to separate
argument `scorer_cfg`.

											
										
										
											2020-07-31 12:02:17 +03:00
+								            kwargs = dict(scorer_cfg)
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 13:53:02 +03:00
+								            kwargs.setdefault("nlp", self)
 								            scorer = Scorer(**kwargs)
-												Fix memory issues in Language.evaluate (#6386)

* Fix memory issues in Language.evaluate

Reset annotation in predicted docs before evaluating and store all data
in `examples`.

* Minor refactor to docs generator init

* Fix generator expression

* Fix final generator check

* Refactor pipeline loop

* Handle examples generator in Language.evaluate

* Add test with generator

* Use make_doc
											
										
										
											2020-12-31 02:45:50 +03:00
+								        # reset annotation in predicted docs and time tokenization
-												Move timing into Language.evaluate (#5836)

Move timing into `Language.evaluate` so that only the processing is
timing, not processing + scoring. `Language.evaluate` returns
`scores["speed"]` as words per second, which should be identical to how
the speed was added to the scores previously. Also add the speed to the
evaluate CLI output.
											
										
										
											2020-07-29 12:02:31 +03:00
+								        start_time = timer()
-												Restore tokenization timing in Language.evaluate (#9305)

Restore tokenization timing steps that were accidentally removed in #6765.
											
										
										
											2021-09-27 21:44:14 +03:00
+								        # this is purely for timing
 								        for eg in examples:
 								            self.make_doc(eg.reference.text)
-												Fix memory issues in Language.evaluate (#6386)

* Fix memory issues in Language.evaluate

Reset annotation in predicted docs before evaluating and store all data
in `examples`.

* Minor refactor to docs generator init

* Fix generator expression

* Fix final generator check

* Refactor pipeline loop

* Handle examples generator in Language.evaluate

* Add test with generator

* Use make_doc
											
										
										
											2020-12-31 02:45:50 +03:00
+								        # apply all pipeline components
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        for name, pipe in self.pipeline:
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
+								            kwargs = component_cfg.get(name, {})
 								            kwargs.setdefault("batch_size", batch_size)
-												Fix memory issues in Language.evaluate (#6386)

* Fix memory issues in Language.evaluate

Reset annotation in predicted docs before evaluating and store all data
in `examples`.

* Minor refactor to docs generator init

* Fix generator expression

* Fix final generator check

* Refactor pipeline loop

* Handle examples generator in Language.evaluate

* Add test with generator

* Use make_doc
											
										
										
											2020-12-31 02:45:50 +03:00
+								            for doc, eg in zip(
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								                _pipe(
 								                    (eg.predicted for eg in examples),
 								                    proc=pipe,
 								                    name=name,
 								                    default_error_handler=self.default_error_handler,
 								                    kwargs=kwargs,
 								                ),
 								                examples,
-												Fix memory issues in Language.evaluate (#6386)

* Fix memory issues in Language.evaluate

Reset annotation in predicted docs before evaluating and store all data
in `examples`.

* Minor refactor to docs generator init

* Fix generator expression

* Fix final generator check

* Refactor pipeline loop

* Handle examples generator in Language.evaluate

* Add test with generator

* Use make_doc
											
										
										
											2020-12-31 02:45:50 +03:00
+								            ):
 								                eg.predicted = doc
-												Move timing into Language.evaluate (#5836)

Move timing into `Language.evaluate` so that only the processing is
timing, not processing + scoring. `Language.evaluate` returns
`scores["speed"]` as words per second, which should be identical to how
the speed was added to the scores previously. Also add the speed to the
evaluate CLI output.
											
										
										
											2020-07-29 12:02:31 +03:00
+								        end_time = timer()
 								        results = scorer.score(examples)
-												Fix memory issues in Language.evaluate (#6386)

* Fix memory issues in Language.evaluate

Reset annotation in predicted docs before evaluating and store all data
in `examples`.

* Minor refactor to docs generator init

* Fix generator expression

* Fix final generator check

* Refactor pipeline loop

* Handle examples generator in Language.evaluate

* Add test with generator

* Use make_doc
											
										
										
											2020-12-31 02:45:50 +03:00
+								        n_words = sum(len(eg.predicted) for eg in examples)
-												Move timing into Language.evaluate (#5836)

Move timing into `Language.evaluate` so that only the processing is
timing, not processing + scoring. `Language.evaluate` returns
`scores["speed"]` as words per second, which should be identical to how
the speed was added to the scores previously. Also add the speed to the
evaluate CLI output.
											
										
										
											2020-07-29 12:02:31 +03:00
+								        results["speed"] = n_words / (end_time - start_time)
 								        return results
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
-												Remove 'device' argument from Language, clean up 'sgd' arg

											
										
										
											2020-09-29 12:42:19 +03:00
+								    def create_optimizer(self):
 								        """Create an optimizer, usually using the [training.optimizer] config."""
-												Clean up sgd

											
										
										
											2020-09-29 13:00:08 +03:00
+								        subconfig = {"optimizer": self.config["training"]["optimizer"]}
 								        return registry.resolve(subconfig)["optimizer"]
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
-												Fix GPU usage in Language

											
										
										
											2017-05-18 12:25:19 +03:00
+								    @contextmanager
-												Let Langugae.use_params work with falsey inputs

The Language.use_params method was failing if you passed in None, which
meant we had to use awkward conditionals for the parameter averaging.
This solves the problem.

											
										
										
											2020-09-03 13:51:04 +03:00
+								    def use_params(self, params: Optional[dict]):
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        """Replace weights of models in the pipeline with those provided in the
 								        params dictionary. Can be used as a contextmanager, in which case,
 								        models go back to their original weights after the block.
 								        params (dict): A dictionary of parameters keyed by model ID.
 								        EXAMPLE:
 								            >>> with nlp.use_params(optimizer.averages):
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								            >>>     nlp.to_disk("/tmp/checkpoint")
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#use_params
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        """
-												Let Langugae.use_params work with falsey inputs

The Language.use_params method was failing if you passed in None, which
meant we had to use awkward conditionals for the parameter averaging.
This solves the problem.

											
										
										
											2020-09-03 13:51:04 +03:00
+								        if not params:
 								            yield
 								        else:
 								            contexts = [
 								                pipe.use_params(params)
 								                for name, pipe in self.pipeline
 								                if hasattr(pipe, "use_params") and hasattr(pipe, "model")
 								            ]
 								            # TODO: Having trouble with contextlib
 								            # Workaround: these aren't actually context managers atm.
 								            for context in contexts:
 								                try:
 								                    next(context)
 								                except StopIteration:
 								                    pass
 								            yield
 								            for context in contexts:
 								                try:
 								                    next(context)
 								                except StopIteration:
 								                    pass
-												Fix GPU usage in Language

											
										
										
											2017-05-18 12:25:19 +03:00
-												Add the right return type for Language.pipe and an overload for the as_tuples case (#8441)

* Add the right return type for Language.pipe and an overload for the as_tuples version

* Reformat, tidy up

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
											
										
										
											2021-07-06 15:18:40 +03:00
+								    _AnyContext = TypeVar("_AnyContext")
 								    @overload
 								    def pipe(
 								        self,
 								        texts: Iterable[Tuple[str, _AnyContext]],
 								        *,
 								        as_tuples: bool = ...,
 								        batch_size: Optional[int] = ...,
 								        disable: Iterable[str] = ...,
 								        component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
 								        n_process: int = ...,
 								    ) -> Iterator[Tuple[Doc, _AnyContext]]:
 								        ...
-												Tidy up and auto-format

											
										
										
											2021-07-18 08:44:56 +03:00
+								    def pipe(  # noqa: F811
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        self,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        texts: Iterable[str],
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        *,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        as_tuples: bool = False,
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        batch_size: Optional[int] = None,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        disable: Iterable[str] = SimpleFrozenList(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
 								        n_process: int = 1,
-												Add the right return type for Language.pipe and an overload for the as_tuples case (#8441)

* Add the right return type for Language.pipe and an overload for the as_tuples version

* Reformat, tidy up

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
											
										
										
											2021-07-06 15:18:40 +03:00
+								    ) -> Iterator[Doc]:
-												Tidy up language, lemmatizer and scorer

											
										
										
											2017-10-27 15:40:14 +03:00
+								        """Process texts as a stream, and yield `Doc` objects in order.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        texts (Iterable[str]): A sequence of texts to process.
-												Tidy up docstrings

											
										
										
											2019-03-15 18:23:17 +03:00
+								        as_tuples (bool): If set to True, inputs should be a sequence of
-												Document as_tuples keyword arg of Language.pipe

											
										
										
											2017-08-19 13:21:33 +03:00
+								            (text, context) tuples. Output will then be a sequence of
 								            (doc, context) tuples. Defaults to False.
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        batch_size (Optional[int]): The number of texts to buffer.
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        disable (List[str]): Names of the pipeline components to disable.
 								        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
-												Tidy up docstrings

											
										
										
											2019-03-15 18:23:17 +03:00
+								            arguments for specific components.
-												cleanup components API (#5726)

* add keyword separator for update functions and drop unused "state"

* few more Example tests and various small fixes

* consistently return losses after update call

* eliminate unused tensors field across pipe components

* fix name

* fix arg name
											
										
										
											2020-07-09 20:43:39 +03:00
+								        n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        YIELDS (Doc): Documents in the order of the original text.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#pipe
-												Use consistent formatting for docstrings

											
										
										
											2017-04-15 12:59:21 +03:00
+								        """
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        if n_process == -1:
 								            n_process = mp.cpu_count()
-												Document as_tuples keyword arg of Language.pipe

											
										
										
											2017-08-19 13:21:33 +03:00
+								        if as_tuples:
-												Update text classification model

											
										
										
											2017-07-25 19:57:59 +03:00
+								            text_context1, text_context2 = itertools.tee(texts)
 								            texts = (tc[0] for tc in text_context1)
 								            contexts = (tc[1] for tc in text_context2)
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            docs = self.pipe(
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
+								                texts,
 								                batch_size=batch_size,
 								                disable=disable,
-												Fix multiprocessing for as_tuples=True (#4582)


											
										
										
											2019-11-04 22:29:03 +03:00
+								                n_process=n_process,
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
+								                component_cfg=component_cfg,
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            )
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								            for doc, context in zip(docs, contexts):
-												Update text classification model

											
										
										
											2017-07-25 19:57:59 +03:00
+								                yield (doc, context)
 								            return
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
+								        if component_cfg is None:
 								            component_cfg = {}
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        if batch_size is None:
 								            batch_size = self.batch_size
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
 								        pipes = (
 								            []
-												serialize ENT_ID (#4852)

* expand serialization test for custom token attribute

* add failing test for issue 4849

* define ENT_ID as attr and use in doc serialization

* fix few typos

											
										
										
											2020-01-06 16:57:34 +03:00
+								        )  # contains functools.partial objects to easily create multiprocess worker.
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								        for name, proc in self.pipeline:
-												Use disable argument (list) for serialization

											
										
										
											2017-05-26 13:33:54 +03:00
+								            if name in disable:
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
+								                continue
-												💫 Allow passing of config parameters to specific pipeline components (#3386)

* Add component_cfg kwarg to begin_training

* Document component_cfg arg to begin_training

* Update docs and auto-format

* Support component_cfg across Language

* Format

* Update docs and docstrings [ci skip]

* Fix begin_training

											
										
										
											2019-03-11 01:36:47 +03:00
+								            kwargs = component_cfg.get(name, {})
 								            # Allow component_cfg to overwrite the top-level kwargs.
 								            kwargs.setdefault("batch_size", batch_size)
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								            f = functools.partial(
 								                _pipe,
 								                proc=proc,
 								                name=name,
 								                kwargs=kwargs,
 								                default_error_handler=self.default_error_handler,
 								            )
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								            pipes.append(f)
 								        if n_process != 1:
 								            docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
 								        else:
 								            # if n_process == 1, no processes are forked.
 								            docs = (self.make_doc(text) for text in texts)
 								            for pipe in pipes:
 								                docs = pipe(docs)
-												Remove the state argument from Language

											
										
										
											2017-05-19 21:25:42 +03:00
+								        for doc in docs:
-												* Add a .pipe method, that takes a stream of input, operates on it, and streams the output. Internally, the stream may be buffered, to allow multi-threading.

											
										
										
											2016-02-03 04:04:55 +03:00
+								            yield doc
-												* Add Language.batch() method, to support multi-threaded jobs

											
										
										
											2016-02-01 11:01:13 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def _multiprocessing_pipe(
 								        self,
 								        texts: Iterable[str],
 								        pipes: Iterable[Callable[[Doc], Doc]],
 								        n_process: int,
 								        batch_size: int,
 								    ) -> None:
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        # raw_texts is used later to stop iteration.
 								        texts, raw_texts = itertools.tee(texts)
 								        # for sending texts to worker
 								        texts_q = [mp.Queue() for _ in range(n_process)]
-												serialize ENT_ID (#4852)

* expand serialization test for custom token attribute

* add failing test for issue 4849

* define ENT_ID as attr and use in doc serialization

* fix few typos

											
										
										
											2020-01-06 16:57:34 +03:00
+								        # for receiving byte-encoded docs from worker
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        bytedocs_recv_ch, bytedocs_send_ch = zip(
 								            *[mp.Pipe(False) for _ in range(n_process)]
 								        )
-												Tidy up and avoid absolute spacy imports in core

											
										
										
											2020-05-21 21:05:03 +03:00
+								        batch_texts = util.minibatch(texts, batch_size)
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        # Sender sends texts to the workers.
 								        # This is necessary to properly handle infinite length of texts.
 								        # (In this case, all data cannot be sent to the workers at once)
 								        sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
-												serialize ENT_ID (#4852)

* expand serialization test for custom token attribute

* add failing test for issue 4849

* define ENT_ID as attr and use in doc serialization

* fix few typos

											
										
										
											2020-01-06 16:57:34 +03:00
+								        # send twice to make process busy
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        sender.send()
 								        sender.send()
 								        procs = [
-												load Underscore state when multiprocessing

											
										
										
											2020-02-12 13:50:42 +03:00
+								            mp.Process(
 								                target=_apply_pipes,
-												Merge branch 'master' into tmp/sync

											
										
										
											2020-03-26 15:38:14 +03:00
+								                args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
-												load Underscore state when multiprocessing

											
										
										
											2020-02-12 13:50:42 +03:00
+								            )
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								            for rch, sch in zip(texts_q, bytedocs_send_ch)
 								        ]
 								        for proc in procs:
 								            proc.start()
 								        # Cycle channels not to break the order of docs.
-												serialize ENT_ID (#4852)

* expand serialization test for custom token attribute

* add failing test for issue 4849

* define ENT_ID as attr and use in doc serialization

* fix few typos

											
										
										
											2020-01-06 16:57:34 +03:00
+								        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								        byte_tuples = chain.from_iterable(
 								            recv.recv() for recv in cycle(bytedocs_recv_ch)
 								        )
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        try:
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								            for i, (_, (byte_doc, byte_error)) in enumerate(
 								                zip(raw_texts, byte_tuples), 1
 								            ):
-												Handle errors while multiprocessing (#8004)

* Handle errors while multiprocessing

Handle errors while multiprocessing without hanging.

* Return the traceback for errors raised while processing a batch, which
  can be handled by the top-level error handler
* Allow for shortened batches due to custom error handlers that ignore
  errors and skip documents

* Define custom components at a higher level

* Also move up custom error handler

* Use simpler component for test

* Switch error type

* Adjust test

* Only call top-level error handler for exceptions

* Register custom test components within tests

Use global functions (so they can be pickled) but register the
components only within the individual tests.
											
										
										
											2021-05-17 14:28:39 +03:00
+								                if byte_doc is not None:
 								                    doc = Doc(self.vocab).from_bytes(byte_doc)
 								                    yield doc
 								                elif byte_error is not None:
 								                    error = srsly.msgpack_loads(byte_error)
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								                    self.default_error_handler(
 								                        None, None, None, ValueError(Errors.E871.format(error=error))
 								                    )
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								                if i % batch_size == 0:
 								                    # tell `sender` that one batch was consumed.
 								                    sender.step()
 								        finally:
 								            for proc in procs:
 								                proc.terminate()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def _link_components(self) -> None:
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								        """Register 'listeners' within pipeline components, to allow them to
 								        effectively share weights.
 								        """
-												Fix linking resumed components (#6859)

* link components across enabled, resumed and frozen

* revert renaming

* revert renaming, the sequel
											
										
										
											2021-02-01 14:19:58 +03:00
+								        # I had thought, "Why do we do this inside the Language object? Shouldn't
-												Add comment

											
										
										
											2020-09-16 18:51:29 +03:00
+								        # it be the tok2vec/transformer/etc's job?
 								        # The problem is we need to do it during deserialization...And the
 								        # components don't receive the pipeline then. So this does have to be
 								        # here :(
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								        for i, (name1, proc1) in enumerate(self.pipeline):
 								            if hasattr(proc1, "find_listeners"):
-												Tidy up and auto-format

											
										
										
											2020-09-21 11:59:07 +03:00
+								                for name2, proc2 in self.pipeline[i + 1 :]:
-												warn when frozen components break listener pattern (#6766)

* warn when frozen components break listener pattern

* few notes in the documentation

* update arg name

* formatting

* cleanup

* specify listeners return type
											
										
										
											2021-01-20 03:12:35 +03:00
+								                    proc1.find_listeners(proc2)
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    @classmethod
 								    def from_config(
 								        cls,
 								        config: Union[Dict[str, Any], Config] = {},
-												Make more args keyword-only

											
										
										
											2020-07-27 01:27:53 +03:00
+								        *,
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        vocab: Union[Vocab, bool] = True,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        disable: Iterable[str] = SimpleFrozenList(),
 								        exclude: Iterable[str] = SimpleFrozenList(),
-												Allow overriding meta from spacy.blank

											
										
										
											2020-09-15 12:12:12 +03:00
+								        meta: Dict[str, Any] = SimpleFrozenDict(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        auto_fill: bool = True,
 								        validate: bool = True,
 								    ) -> "Language":
 								        """Create the nlp object from a loaded config. Will set up the tokenizer
 								        and language data, add pipeline components etc. If no config is provided,
 								        the default config of the given language is used.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
 								        config (Dict[str, Any] / Config): The loaded config.
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        vocab (Vocab): A Vocab object. If True, a vocab is created.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        disable (Iterable[str]): Names of pipeline components to disable.
 								            Disabled pipes will be loaded but they won't be run unless you
 								            explicitly enable them by calling nlp.enable_pipe.
 								        exclude (Iterable[str]): Names of pipeline components to exclude.
 								            Excluded components won't be loaded.
-												Allow overriding meta from spacy.blank

											
										
										
											2020-09-15 12:12:12 +03:00
+								        meta (Dict[str, Any]): Meta overrides for nlp.meta.
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								        auto_fill (bool): Automatically fill in missing values in config based
 								            on defaults and function argument annotations.
 								        validate (bool): Validate the component config and arguments against
 								            the types expected by the factory.
 								        RETURNS (Language): The initialized Language class.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#from_config
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        """
 								        if auto_fill:
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								            config = Config(
 								                cls.default_config, section_order=CONFIG_SECTION_ORDER
 								            ).merge(config)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        if "nlp" not in config:
 								            raise ValueError(Errors.E985.format(config=config))
-												Fix language detection

											
										
										
											2020-09-29 22:08:13 +03:00
+								        config_lang = config["nlp"].get("lang")
-												Fix lang check and error handling in Language.from_config

											
										
										
											2020-09-15 15:24:06 +03:00
+								        if config_lang is not None and config_lang != cls.lang:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            raise ValueError(
 								                Errors.E958.format(
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								                    bad_lang_code=config["nlp"]["lang"],
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								                    lang_code=cls.lang,
 								                    lang=util.get_object_name(cls),
 								                )
 								            )
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								        config["nlp"]["lang"] = cls.lang
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        # This isn't very elegant, but we remove the [components] block here to prevent
 								        # it from getting resolved (causes problems because we expect to pass in
 								        # the nlp and name args for each component). If we're auto-filling, we're
 								        # using the nlp.config with all defaults.
 								        config = util.copy_config(config)
 								        orig_pipeline = config.pop("components", {})
-												Fixing pretrain (#7342)

* initialize NLP with train corpus

* add more pretraining tests

* more tests

* function to fetch tok2vec layer for pretraining

* clarify parameter name

* test different objectives

* formatting

* fix check for static vectors when using vectors objective

* clarify docs

* logger statement

* fix init_tok2vec and proc.initialize order

* test training after pretraining

* add init_config tests for pretraining

* pop pretraining block to avoid config validation errors

* custom errors
											
										
										
											2021-03-09 06:01:13 +03:00
+								        orig_pretraining = config.pop("pretraining", None)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        config["components"] = {}
-												Fix base schema integration

											
										
										
											2020-09-27 23:50:36 +03:00
+								        if auto_fill:
 								            filled = registry.fill(config, validate=validate, schema=ConfigSchema)
 								        else:
 								            filled = config
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        filled["components"] = orig_pipeline
 								        config["components"] = orig_pipeline
-												Fixing pretrain (#7342)

* initialize NLP with train corpus

* add more pretraining tests

* more tests

* function to fetch tok2vec layer for pretraining

* clarify parameter name

* test different objectives

* formatting

* fix check for static vectors when using vectors objective

* clarify docs

* logger statement

* fix init_tok2vec and proc.initialize order

* test training after pretraining

* add init_config tests for pretraining

* pop pretraining block to avoid config validation errors

* custom errors
											
										
										
											2021-03-09 06:01:13 +03:00
+								        if orig_pretraining is not None:
 								            filled["pretraining"] = orig_pretraining
 								            config["pretraining"] = orig_pretraining
-												Fix base schema integration

											
										
										
											2020-09-27 23:50:36 +03:00
+								        resolved_nlp = registry.resolve(
 								            filled["nlp"], validate=validate, schema=ConfigSchemaNlp
 								        )
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								        create_tokenizer = resolved_nlp["tokenizer"]
 								        before_creation = resolved_nlp["before_creation"]
 								        after_creation = resolved_nlp["after_creation"]
 								        after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
-												Add config callbacks for modifying nlp object before and after init (#5866)

* WIP: Concept for modifying nlp object before and after init

* Make callbacks return nlp object

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Raise if callbacks don't return correct type

* Rename, update types, add after_pipeline_creation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-05 20:47:54 +03:00
+								        lang_cls = cls
 								        if before_creation is not None:
 								            lang_cls = before_creation(cls)
 								            if (
 								                not isinstance(lang_cls, type)
 								                or not issubclass(lang_cls, cls)
 								                or lang_cls is not cls
 								            ):
 								                raise ValueError(Errors.E943.format(value=type(lang_cls)))
-												Add warning about GPU selection in Jupyter notebooks (#7075)

* Initial warning

* Update check

* Redo edit

* Move jupyter warning to helper method

* Add link with details to warnings
											
										
										
											2021-03-09 17:35:21 +03:00
 								        # Warn about require_gpu usage in jupyter notebook
 								        warn_if_jupyter_cupy()
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        # Note that we don't load vectors here, instead they get loaded explicitly
 								        # inside stuff like the spacy train function. If we loaded them here,
 								        # then we would load them twice at runtime: once when we make from config,
 								        # and then again when we load from disk.
-												Allow overriding meta from spacy.blank

											
										
										
											2020-09-15 12:12:12 +03:00
+								        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
-												Add config callbacks for modifying nlp object before and after init (#5866)

* WIP: Concept for modifying nlp object before and after init

* Make callbacks return nlp object

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Raise if callbacks don't return correct type

* Rename, update types, add after_pipeline_creation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-05 20:47:54 +03:00
+								        if after_creation is not None:
 								            nlp = after_creation(nlp)
 								            if not isinstance(nlp, cls):
 								                raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								        # To create the components we need to use the final interpolated config
 								        # so all values are available (if component configs use variables).
 								        # Later we replace the component config with the raw config again.
 								        interpolated = filled.interpolate() if not filled.is_interpolated else filled
 								        pipeline = interpolated.get("components", {})
-												Move replacement logic to Language.from_config

											
										
										
											2021-01-29 11:37:04 +03:00
+								        sourced = util.get_sourced_components(interpolated)
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								        # If components are loaded from a source (existing models), we cache
 								        # them here so they're only loaded once
 								        source_nlps = {}
-												Fix vectors check for sourced components (#8559)

* Fix vectors check for sourced components

Since vectors are not loaded when components are sourced, store a hash
for the vectors of each sourced component and compare it to the loaded
vectors after the vectors are loaded from the `[initialize]` block.

* Pop temporary info

* Remove stored hash in remove_pipe

* Add default for pop

* Add additional convert/debug/assemble CLI tests
											
										
										
											2021-07-06 13:43:17 +03:00
+								        source_nlp_vectors_hashes = {}
-												Sync vocab in vectors and components sourced in configs (#9335)

Since a component may reference anything in the vocab, share the full
vocab when loading source components and vectors (which will include
`strings` as of #8909).

When loading a source component from a config, save and restore the
vocab state after loading source pipelines, in particular to preserve
the original state without vectors, since `[initialize.vectors]
= null` skips rather than resets the vectors.

The vocab references are not synced for components loaded with
`Language.add_pipe(source=)` because the pipelines are already loaded
and not necessarily with the same vocab. A warning could be added in
`Language.create_pipe_from_source` that it may be necessary to save and
reload before training, but it's a rare enough case that this kind of
warning may be too noisy overall.
											
										
										
											2021-10-04 13:19:02 +03:00
+								        vocab_b = None
-												Simplify language data and revert detailed configs

											
										
										
											2020-07-24 15:50:26 +03:00
+								        for pipe_name in config["nlp"]["pipeline"]:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								            if pipe_name not in pipeline:
 								                opts = ", ".join(pipeline.keys())
 								                raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
-												@factories -> factory (#5801)


											
										
										
											2020-07-22 18:29:31 +03:00
+								            pipe_cfg = util.copy_config(pipeline[pipe_name])
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								            raw_config = Config(filled["components"][pipe_name])
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								            if pipe_name not in exclude:
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								                if "factory" not in pipe_cfg and "source" not in pipe_cfg:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								                    err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
 								                    raise ValueError(err)
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								                if "factory" in pipe_cfg:
 								                    factory = pipe_cfg.pop("factory")
 								                    # The pipe name (key in the config) here is the unique name
 								                    # of the component, not necessarily the factory
 								                    nlp.add_pipe(
-												Update for new Thinc and adjust config

											
										
										
											2020-08-13 18:38:30 +03:00
+								                        factory,
 								                        name=pipe_name,
 								                        config=pipe_cfg,
 								                        validate=validate,
 								                        raw_config=raw_config,
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								                    )
 								                else:
-												Sync vocab in vectors and components sourced in configs (#9335)

Since a component may reference anything in the vocab, share the full
vocab when loading source components and vectors (which will include
`strings` as of #8909).

When loading a source component from a config, save and restore the
vocab state after loading source pipelines, in particular to preserve
the original state without vectors, since `[initialize.vectors]
= null` skips rather than resets the vectors.

The vocab references are not synced for components loaded with
`Language.add_pipe(source=)` because the pipelines are already loaded
and not necessarily with the same vocab. A warning could be added in
`Language.create_pipe_from_source` that it may be necessary to save and
reload before training, but it's a rare enough case that this kind of
warning may be too noisy overall.
											
										
										
											2021-10-04 13:19:02 +03:00
+								                    # We need the sourced components to reference the same
 								                    # vocab without modifying the current vocab state **AND**
 								                    # we still want to load the source model vectors to perform
 								                    # the vectors check. Since the source vectors clobber the
 								                    # current ones, we save the original vocab state and
 								                    # restore after this loop. Existing strings are preserved
 								                    # during deserialization, so they do not need any
 								                    # additional handling.
 								                    if vocab_b is None:
 								                        vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								                    model = pipe_cfg["source"]
 								                    if model not in source_nlps:
-												Sync vocab in vectors and components sourced in configs (#9335)

Since a component may reference anything in the vocab, share the full
vocab when loading source components and vectors (which will include
`strings` as of #8909).

When loading a source component from a config, save and restore the
vocab state after loading source pipelines, in particular to preserve
the original state without vectors, since `[initialize.vectors]
= null` skips rather than resets the vectors.

The vocab references are not synced for components loaded with
`Language.add_pipe(source=)` because the pipelines are already loaded
and not necessarily with the same vocab. A warning could be added in
`Language.create_pipe_from_source` that it may be necessary to save and
reload before training, but it's a rare enough case that this kind of
warning may be too noisy overall.
											
										
										
											2021-10-04 13:19:02 +03:00
+								                        # Load with the same vocab, adding any strings
 								                        source_nlps[model] = util.load_model(
 								                            model, vocab=nlp.vocab, exclude=["lookups"]
 								                        )
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								                    source_name = pipe_cfg.get("component", pipe_name)
-												Implement replace_listeners for source in config (#7620)

Implement replace_listeners for sourced components loaded from a config.
											
										
										
											2021-04-08 11:21:22 +03:00
+								                    listeners_replaced = False
 								                    if "replace_listeners" in pipe_cfg:
 								                        for name, proc in source_nlps[model].pipeline:
 								                            if source_name in getattr(proc, "listening_components", []):
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								                                source_nlps[model].replace_listeners(
 								                                    name, source_name, pipe_cfg["replace_listeners"]
 								                                )
-												Implement replace_listeners for source in config (#7620)

Implement replace_listeners for sourced components loaded from a config.
											
										
										
											2021-04-08 11:21:22 +03:00
+								                                listeners_replaced = True
-												Fix vectors check for sourced components (#8559)

* Fix vectors check for sourced components

Since vectors are not loaded when components are sourced, store a hash
for the vectors of each sourced component and compare it to the loaded
vectors after the vectors are loaded from the `[initialize]` block.

* Pop temporary info

* Remove stored hash in remove_pipe

* Add default for pop

* Add additional convert/debug/assemble CLI tests
											
										
										
											2021-07-06 13:43:17 +03:00
+								                    with warnings.catch_warnings():
 								                        warnings.filterwarnings("ignore", message="\\[W113\\]")
-												Auto-format code with black

											
										
										
											2021-07-09 11:06:06 +03:00
+								                        nlp.add_pipe(
 								                            source_name, source=source_nlps[model], name=pipe_name
 								                        )
-												Fix vectors check for sourced components (#8559)

* Fix vectors check for sourced components

Since vectors are not loaded when components are sourced, store a hash
for the vectors of each sourced component and compare it to the loaded
vectors after the vectors are loaded from the `[initialize]` block.

* Pop temporary info

* Remove stored hash in remove_pipe

* Add default for pop

* Add additional convert/debug/assemble CLI tests
											
										
										
											2021-07-06 13:43:17 +03:00
+								                    if model not in source_nlp_vectors_hashes:
-												Auto-format code with black

											
										
										
											2021-07-09 11:06:06 +03:00
+								                        source_nlp_vectors_hashes[model] = hash(
 								                            source_nlps[model].vocab.vectors.to_bytes()
 								                        )
-												Only add sourced vectors hashes to meta if necessary (#8830)


											
										
										
											2021-08-02 19:22:35 +03:00
+								                    if "_sourced_vectors_hashes" not in nlp.meta:
 								                        nlp.meta["_sourced_vectors_hashes"] = {}
-												Auto-format code with black

											
										
										
											2021-07-09 11:06:06 +03:00
+								                    nlp.meta["_sourced_vectors_hashes"][
 								                        pipe_name
 								                    ] = source_nlp_vectors_hashes[model]
-												Implement replace_listeners for source in config (#7620)

Implement replace_listeners for sourced components loaded from a config.
											
										
										
											2021-04-08 11:21:22 +03:00
+								                    # Delete from cache if listeners were replaced
 								                    if listeners_replaced:
 								                        del source_nlps[model]
-												Sync vocab in vectors and components sourced in configs (#9335)

Since a component may reference anything in the vocab, share the full
vocab when loading source components and vectors (which will include
`strings` as of #8909).

When loading a source component from a config, save and restore the
vocab state after loading source pipelines, in particular to preserve
the original state without vectors, since `[initialize.vectors]
= null` skips rather than resets the vectors.

The vocab references are not synced for components loaded with
`Language.add_pipe(source=)` because the pipelines are already loaded
and not necessarily with the same vocab. A warning could be added in
`Language.create_pipe_from_source` that it may be necessary to save and
reload before training, but it's a rare enough case that this kind of
warning may be too noisy overall.
											
										
										
											2021-10-04 13:19:02 +03:00
+								        # Restore the original vocab after sourcing if necessary
 								        if vocab_b is not None:
 								            nlp.vocab.from_bytes(vocab_b)
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        disabled_pipes = [*config["nlp"]["disabled"], *disable]
-												Make user-facing Language.disabled return list

More consistent with all the other properties

											
										
										
											2020-08-29 13:08:33 +03:00
+								        nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
-												Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.

											
										
										
											2020-12-09 11:13:26 +03:00
+								        nlp.batch_size = config["nlp"]["batch_size"]
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        nlp.config = filled if auto_fill else config
-												Add config callbacks for modifying nlp object before and after init (#5866)

* WIP: Concept for modifying nlp object before and after init

* Make callbacks return nlp object

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Raise if callbacks don't return correct type

* Rename, update types, add after_pipeline_creation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-05 20:47:54 +03:00
+								        if after_pipeline_creation is not None:
 								            nlp = after_pipeline_creation(nlp)
 								            if not isinstance(nlp, cls):
 								                raise ValueError(
 								                    Errors.E942.format(name="pipeline_creation", value=type(nlp))
 								                )
-												Move replacement logic to Language.from_config

											
										
										
											2021-01-29 11:37:04 +03:00
+								        # Detect components with listeners that are not frozen consistently
 								        for name, proc in nlp.pipeline:
-												Proactively remove unused listeners

With this the changes in initialize.py might be unecessary.

Requires testing.

											
										
										
											2021-03-17 16:41:41 +03:00
+								            # Remove listeners not in the pipeline
 								            listener_names = getattr(proc, "listening_components", [])
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								            unused_listener_names = [
 								                ll for ll in listener_names if ll not in nlp.pipe_names
 								            ]
-												Proactively remove unused listeners

With this the changes in initialize.py might be unecessary.

Requires testing.

											
										
										
											2021-03-17 16:41:41 +03:00
+								            for listener_name in unused_listener_names:
 								                for listener in proc.listener_map.get(listener_name, []):
 								                    proc.remove_listener(listener, listener_name)
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								            for listener in getattr(
 								                proc, "listening_components", []
 								            ):  # e.g. tok2vec/transformer
-												Proactively remove unused listeners

With this the changes in initialize.py might be unecessary.

Requires testing.

											
										
										
											2021-03-17 16:41:41 +03:00
+								                # If it's a component sourced from another pipeline, we check if
 								                # the tok2vec listeners should be replaced with standalone tok2vec
 								                # models (e.g. so component can be frozen without its performance
 								                # degrading when other components/tok2vec are updated)
 								                paths = sourced.get(listener, {}).get("replace_listeners", [])
 								                if paths:
 								                    nlp.replace_listeners(name, listener, paths)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        return nlp
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								    def replace_listeners(
-												Tidy up and auto-format

											
										
										
											2021-01-30 04:52:33 +03:00
+								        self,
 								        tok2vec_name: str,
 								        pipe_name: str,
 								        listeners: Iterable[str],
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								    ) -> None:
 								        """Find listener layers (connecting to a token-to-vector embedding
 								        component) of a given pipeline component model and replace
 								        them with a standalone copy of the token-to-vector layer. This can be
 								        useful when training a pipeline with components sourced from an existing
 								        pipeline: if multiple components (e.g. tagger, parser, NER) listen to
 								        the same tok2vec component, but some of them are frozen and not updated,
 								        their performance may degrade significally as the tok2vec component is
 								        updated with new data. To prevent this, listeners can be replaced with
 								        a standalone tok2vec layer that is owned by the component and doesn't
 								        change if the component isn't updated.
 								        tok2vec_name (str): Name of the token-to-vector component, typically
 								            "tok2vec" or "transformer".
 								        pipe_name (str): Name of pipeline component to replace listeners for.
 								        listeners (Iterable[str]): The paths to the listeners, relative to the
 								            component config, e.g. ["model.tok2vec"]. Typically, implementations
 								            will only connect to one tok2vec component, [model.tok2vec], but in
 								            theory, custom models can use multiple listeners. The value here can
 								            either be an empty list to not replace any listeners, or a complete
 								            (!) list of the paths to all listener layers used by the model.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#replace_listeners
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								        """
 								        if tok2vec_name not in self.pipe_names:
-												Improve E889

											
										
										
											2021-01-29 15:39:23 +03:00
+								            err = Errors.E889.format(
 								                tok2vec=tok2vec_name,
 								                name=pipe_name,
 								                unknown=tok2vec_name,
 								                opts=", ".join(self.pipe_names),
 								            )
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								            raise ValueError(err)
 								        if pipe_name not in self.pipe_names:
-												Improve E889

											
										
										
											2021-01-29 15:39:23 +03:00
+								            err = Errors.E889.format(
 								                tok2vec=tok2vec_name,
 								                name=pipe_name,
 								                unknown=pipe_name,
 								                opts=", ".join(self.pipe_names),
 								            )
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								            raise ValueError(err)
 								        tok2vec = self.get_pipe(tok2vec_name)
 								        tok2vec_cfg = self.get_pipe_config(tok2vec_name)
-												call replace_listener_cfg attr if it's available

											
										
										
											2021-05-12 18:19:38 +03:00
+								        tok2vec_model = tok2vec.model
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								        if (
 								            not hasattr(tok2vec, "model")
 								            or not hasattr(tok2vec, "listener_map")
-												Add check for remove_listener method

											
										
										
											2021-01-29 15:47:30 +03:00
+								            or not hasattr(tok2vec, "remove_listener")
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								            or "model" not in tok2vec_cfg
 								        ):
 								            raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec)))
 								        pipe_listeners = tok2vec.listener_map.get(pipe_name, [])
-												call replace_listener_cfg attr if it's available

											
										
										
											2021-05-12 18:19:38 +03:00
+								        pipe = self.get_pipe(pipe_name)
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								        pipe_cfg = self._pipe_configs[pipe_name]
 								        if listeners:
 								            util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
 								            if len(listeners) != len(pipe_listeners):
 								                # The number of listeners defined in the component model doesn't
 								                # match the listeners to replace, so we won't be able to update
 								                # the nodes and generate a matching config
 								                err = Errors.E887.format(
 								                    name=pipe_name,
 								                    tok2vec=tok2vec_name,
 								                    paths=listeners,
 								                    n_listeners=len(pipe_listeners),
 								                )
 								                raise ValueError(err)
-												Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2021-01-29 13:41:17 +03:00
+								            # Update the config accordingly by copying the tok2vec model to all
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								            # sections defined in the listener paths
 								            for listener_path in listeners:
 								                # Check if the path actually exists in the config
 								                try:
 								                    util.dot_to_object(pipe_cfg, listener_path)
 								                except KeyError:
 								                    err = Errors.E886.format(
 								                        name=pipe_name, tok2vec=tok2vec_name, path=listener_path
 								                    )
 								                    raise ValueError(err)
-												call replace_listener_cfg attr if it's available

											
										
										
											2021-05-12 18:19:38 +03:00
+								                new_config = tok2vec_cfg["model"]
 								                if "replace_listener_cfg" in tok2vec_model.attrs:
 								                    replace_func = tok2vec_model.attrs["replace_listener_cfg"]
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								                    new_config = replace_func(
 								                        tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
 								                    )
-												call replace_listener_cfg attr if it's available

											
										
										
											2021-05-12 18:19:38 +03:00
+								                util.set_dot_to_object(pipe_cfg, listener_path, new_config)
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
+								            # Go over the listener layers and replace them
 								            for listener in pipe_listeners:
-												call replace_listener_cfg attr if it's available

											
										
										
											2021-05-12 18:19:38 +03:00
+								                new_model = tok2vec_model.copy()
 								                if "replace_listener" in tok2vec_model.attrs:
 								                    new_model = tok2vec_model.attrs["replace_listener"](new_model)
-												call replace_listener attr if it's available

											
										
										
											2021-05-12 17:01:02 +03:00
+								                util.replace_model_node(pipe.model, listener, new_model)
-												Move replacement logic to Language.from_config

											
										
										
											2021-01-29 11:37:04 +03:00
+								                tok2vec.remove_listener(listener, pipe_name)
-												Change method order

											
										
										
											2021-01-29 10:42:41 +03:00
-												Update docs and consistency

											
										
										
											2020-07-29 16:14:07 +03:00
+								    def to_disk(
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-												Update docs and consistency

											
										
										
											2020-07-29 16:14:07 +03:00
+								    ) -> None:
-												Use disable argument (list) for serialization

											
										
										
											2017-05-26 13:33:54 +03:00
+								        """Save the current state to a directory.  If a model is loaded, this
 								        will include the model.
-												Update docstrings

											
										
										
											2017-04-17 02:40:26 +03:00
-												unicode -> str consistency [ci skip]

											
										
										
											2020-05-24 19:51:10 +03:00
+								        path (str / Path): Path to a directory, which will be created if
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            it doesn't exist.
 								        exclude (list): Names of components or serialization fields to exclude.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#to_disk
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        """
 								        path = util.ensure_path(path)
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								        serializers = {}
-												ensure the lang of vocab and nlp stay consistent (#4057)

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

											
										
										
											2019-08-01 18:13:01 +03:00
+								        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
 								            p, exclude=["vocab"]
 								        )
-												issue5230: fixed resource warnings in language

											
										
										
											2020-04-06 19:54:32 +03:00
+								        serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 20:42:27 +03:00
+								        serializers["config.cfg"] = lambda p: self.config.to_disk(p)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        for name, proc in self._components:
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            if name in exclude:
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								                continue
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            if not hasattr(proc, "to_disk"):
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								                continue
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])
-												Pass excludes when serializing vocab (#8824)

* Pass excludes when serializing vocab

Additional minor bug fix:

* Deserialize vocab in `EntityLinker.from_disk`

* Add test for excluding strings on load

* Fix formatting
											
										
										
											2021-08-03 15:42:44 +03:00
+								        serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        util.to_disk(path, serializers, exclude)
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def from_disk(
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								        self,
 								        path: Union[str, Path],
 								        *,
 								        exclude: Iterable[str] = SimpleFrozenList(),
 								        overrides: Dict[str, Any] = SimpleFrozenDict(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    ) -> "Language":
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        """Loads state from a directory. Modifies the object in place and
-												Use disable argument (list) for serialization

											
										
										
											2017-05-26 13:33:54 +03:00
+								        returns it. If the saved `Language` object contains a model, the
 								        model will be loaded.
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
-												unicode -> str consistency [ci skip]

											
										
										
											2020-05-24 19:51:10 +03:00
+								        path (str / Path): A path to a directory.
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude (list): Names of components or serialization fields to exclude.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        RETURNS (Language): The modified `Language` object.
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#from_disk
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        """
-												Merge branch 'develop' into master-tmp

											
										
										
											2020-06-20 16:52:00 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        def deserialize_meta(path: Path) -> None:
-												Improve vector name loading from model meta

											
										
										
											2020-05-27 15:48:54 +03:00
+								            if path.exists():
 								                data = srsly.read_json(path)
 								                self.meta.update(data)
 								                # self.meta always overrides meta["vectors"] with the metadata
 								                # from self.vocab.vectors, so set the name directly
 								                self.vocab.vectors.name = data.get("vectors", {}).get("name")
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        def deserialize_vocab(path: Path) -> None:
-												Improve vector name loading from model meta

											
										
										
											2020-05-27 15:48:54 +03:00
+								            if path.exists():
-												Pass excludes when serializing vocab (#8824)

* Pass excludes when serializing vocab

Additional minor bug fix:

* Deserialize vocab in `EntityLinker.from_disk`

* Add test for excluding strings on load

* Fix formatting
											
										
										
											2021-08-03 15:42:44 +03:00
+								                self.vocab.from_disk(path, exclude=exclude)
-												Improve vector name loading from model meta

											
										
										
											2020-05-27 15:48:54 +03:00
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        path = util.ensure_path(path)
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								        deserializers = {}
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 20:42:27 +03:00
+								        if Path(path / "config.cfg").exists():
-												Don't interpolate config on Language deserialization

											
										
										
											2020-08-27 17:44:36 +03:00
+								            deserializers["config.cfg"] = lambda p: self.config.from_disk(
-												Address missing config overrides post load of models (#8208)


											
										
										
											2021-05-31 11:36:52 +03:00
+								                p, interpolate=False, overrides=overrides
-												Don't interpolate config on Language deserialization

											
										
										
											2020-08-27 17:44:36 +03:00
+								            )
-												Improve vector name loading from model meta

											
										
										
											2020-05-27 15:48:54 +03:00
+								        deserializers["meta.json"] = deserialize_meta
 								        deserializers["vocab"] = deserialize_vocab
-												ensure the lang of vocab and nlp stay consistent (#4057)

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

											
										
										
											2019-08-01 18:13:01 +03:00
+								        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
 								            p, exclude=["vocab"]
 								        )
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        for name, proc in self._components:
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            if name in exclude:
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								                continue
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            if not hasattr(proc, "from_disk"):
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								                continue
-												ensure the lang of vocab and nlp stay consistent (#4057)

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

											
										
										
											2019-08-01 18:13:01 +03:00
+								            deserializers[name] = lambda p, proc=proc: proc.from_disk(
 								                p, exclude=["vocab"]
 								            )
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        if not (path / "vocab").exists() and "vocab" not in exclude:
 								            # Convert to list here in case exclude is (default) tuple
 								            exclude = list(exclude) + ["vocab"]
-												Only load vocab if it exists

											
										
										
											2017-06-01 15:38:35 +03:00
+								        util.from_disk(path, deserializers, exclude)
-												Add Language.path

											
										
										
											2017-10-25 12:57:43 +03:00
+								        self._path = path
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								        self._link_components()
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								        return self
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        """Serialize the current state to a binary string.
-												Remove trailing whitespace

											
										
										
											2016-12-18 18:54:52 +03:00
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude (list): Names of components or serialization fields to exclude.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        RETURNS (bytes): The serialized form of the `Language` object.
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#to_bytes
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        """
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								        serializers = {}
-												Pass excludes when serializing vocab (#8824)

* Pass excludes when serializing vocab

Additional minor bug fix:

* Deserialize vocab in `EntityLinker.from_disk`

* Add test for excluding strings on load

* Fix formatting
											
										
										
											2021-08-03 15:42:44 +03:00
+								        serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
 								        serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 20:42:27 +03:00
+								        serializers["config.cfg"] = lambda: self.config.to_bytes()
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        for name, proc in self._components:
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            if name in exclude:
-												Work on to/from bytes/disk serialization methods

											
										
										
											2017-05-29 12:45:45 +03:00
+								                continue
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            if not hasattr(proc, "to_bytes"):
-												Work on to/from bytes/disk serialization methods

											
										
										
											2017-05-29 12:45:45 +03:00
+								                continue
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
-												Allow reasonably efficient pickling of Language class, using to_bytes() and from_bytes().

											
										
										
											2017-10-17 19:18:10 +03:00
+								        return util.to_bytes(serializers, exclude)
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def from_bytes(
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    ) -> "Language":
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        """Load state from a binary string.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        bytes_data (bytes): The data to load from.
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude (list): Names of components or serialization fields to exclude.
-												Update docstrings and API docs for Language class

											
										
										
											2017-05-19 00:57:38 +03:00
+								        RETURNS (Language): The `Language` object.
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/language#from_bytes
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        """
-												Merge branch 'develop' into master-tmp

											
										
										
											2020-06-20 16:52:00 +03:00
-												Improve vector name loading from model meta

											
										
										
											2020-05-27 15:48:54 +03:00
+								        def deserialize_meta(b):
 								            data = srsly.json_loads(b)
 								            self.meta.update(data)
 								            # self.meta always overrides meta["vectors"] with the metadata
 								            # from self.vocab.vectors, so set the name directly
 								            self.vocab.vectors.name = data.get("vectors", {}).get("name")
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								        deserializers = {}
-												Don't interpolate config on Language deserialization

											
										
										
											2020-08-27 17:44:36 +03:00
+								        deserializers["config.cfg"] = lambda b: self.config.from_bytes(
 								            b, interpolate=False
 								        )
-												Improve vector name loading from model meta

											
										
										
											2020-05-27 15:48:54 +03:00
+								        deserializers["meta.json"] = deserialize_meta
-												Pass excludes when serializing vocab (#8824)

* Pass excludes when serializing vocab

Additional minor bug fix:

* Deserialize vocab in `EntityLinker.from_disk`

* Add test for excluding strings on load

* Fix formatting
											
										
										
											2021-08-03 15:42:44 +03:00
+								        deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
-												ensure the lang of vocab and nlp stay consistent (#4057)

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

											
										
										
											2019-08-01 18:13:01 +03:00
+								        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
 								            b, exclude=["vocab"]
 								        )
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								        for name, proc in self._components:
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            if name in exclude:
-												Work on to/from bytes/disk serialization methods

											
										
										
											2017-05-29 12:45:45 +03:00
+								                continue
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            if not hasattr(proc, "from_bytes"):
-												Work on to/from bytes/disk serialization methods

											
										
										
											2017-05-29 12:45:45 +03:00
+								                continue
-												ensure the lang of vocab and nlp stay consistent (#4057)

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

											
										
										
											2019-08-01 18:13:01 +03:00
+								            deserializers[name] = lambda b, proc=proc: proc.from_bytes(
 								                b, exclude=["vocab"]
 								            )
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        util.from_bytes(bytes_data, deserializers, exclude)
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								        self._link_components()
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        return self
-												Add deprojectivize to pipeline

											
										
										
											2017-05-22 02:43:31 +03:00
-												Work on to/from bytes/disk serialization methods

											
										
										
											2017-05-29 12:45:45 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								@dataclass
 								class FactoryMeta:
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								    """Dataclass containing information about a component and its defaults
 								    provided by the @Language.component or @Language.factory decorator. It's
 								    created whenever a component is defined and stored on the Language class for
 								    each component instance and factory instance.
 								    """
-												Update docs and consistency

											
										
										
											2020-07-29 16:14:07 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    factory: str
 								    default_config: Optional[Dict[str, Any]] = None  # noqa: E704
 								    assigns: Iterable[str] = tuple()
 								    requires: Iterable[str] = tuple()
 								    retokenizes: bool = False
-												Allow pipeline components to set default scores and weights

											
										
										
											2020-07-26 14:18:43 +03:00
+								    scores: Iterable[str] = tuple()
-												Remove scores list from config and document

											
										
										
											2020-07-28 12:22:24 +03:00
+								    default_score_weights: Optional[Dict[str, float]] = None  # noqa: E704
-												Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to go for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors

											
										
										
											2019-10-27 15:35:49 +03:00
-												Add Language.disable_pipes()

											
										
										
											2017-10-25 14:46:41 +03:00
+								class DisabledPipes(list):
-												Tidy up language, lemmatizer and scorer

											
										
										
											2017-10-27 15:40:14 +03:00
+								    """Manager for temporary pipeline disabling."""
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Tidy up docstrings and arguments

											
										
										
											2020-07-29 00:12:42 +03:00
+								    def __init__(self, nlp: Language, names: List[str]) -> None:
-												Add Language.disable_pipes()

											
										
										
											2017-10-25 14:46:41 +03:00
+								        self.nlp = nlp
 								        self.names = names
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        for name in self.names:
 								            self.nlp.disable_pipe(name)
-												Add Language.disable_pipes()

											
										
										
											2017-10-25 14:46:41 +03:00
+								        list.__init__(self)
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        self.extend(self.names)
-												Add Language.disable_pipes()

											
										
										
											2017-10-25 14:46:41 +03:00
 								    def __enter__(self):
-												Return self on __enter__

											
										
										
											2017-10-25 15:56:16 +03:00
+								        return self
-												Add Language.disable_pipes()

											
										
										
											2017-10-25 14:46:41 +03:00
 								    def __exit__(self, *args):
 								        self.restore()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def restore(self) -> None:
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        """Restore the pipeline to its state when DisabledPipes was created."""
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        for name in self.names:
-												Rename user-facing API

											
										
										
											2020-08-28 22:04:02 +03:00
+								            if name not in self.nlp.component_names:
-												Raise if disabled components are removed before DisabledPipes.restore

											
										
										
											2020-08-28 21:35:26 +03:00
+								                raise ValueError(Errors.E008.format(name=name))
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								            self.nlp.enable_pipe(name)
-												Add Language.disable_pipes()

											
										
										
											2017-10-25 14:46:41 +03:00
+								        self[:] = []
-												Copy the Example objects (and their predicted Doc) in nlp.evaluate() and nlp.update() (#6765)

* Make copy of examples in nlp.update and nlp.evaluate

* Avoid circular import

* Fix evaluate
											
										
										
											2021-01-19 18:47:44 +03:00
+								def _copy_examples(examples: Iterable[Example]) -> List[Example]:
 								    """Make a copy of a batch of examples, copying the predicted Doc as well.
 								    This is used in contexts where we need to take ownership of the examples
-												Tidy up code comments [ci skip]

											
										
										
											2021-01-27 04:40:03 +03:00
+								    so that they can be mutated, for instance during Language.evaluate and
-												Copy the Example objects (and their predicted Doc) in nlp.evaluate() and nlp.update() (#6765)

* Make copy of examples in nlp.update and nlp.evaluate

* Avoid circular import

* Fix evaluate
											
										
										
											2021-01-19 18:47:44 +03:00
+								    Language.update.
 								    """
 								    return [Example(eg.x.copy(), eg.y) for eg in examples]
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def _apply_pipes(
 								    make_doc: Callable[[str], Doc],
 								    pipes: Iterable[Callable[[Doc], Doc]],
 								    receiver,
 								    sender,
 								    underscore_state: Tuple[dict, dict, dict],
 								) -> None:
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								    """Worker for Language.pipe
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    make_doc (Callable[[str,] Doc]): Function to create Doc from text.
 								    pipes (Iterable[Callable[[Doc], Doc]]): The components to apply.
-												Fix formatting [ci skip]

											
										
										
											2019-10-18 12:33:38 +03:00
+								    receiver (multiprocessing.Connection): Pipe to receive text. Usually
 								        created by `multiprocessing.Pipe()`
 								    sender (multiprocessing.Connection): Pipe to send doc. Usually created by
 								        `multiprocessing.Pipe()`
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    underscore_state (Tuple[dict, dict, dict]): The data in the Underscore class
 								        of the parent.
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								    """
-												load Underscore state when multiprocessing

											
										
										
											2020-02-12 13:50:42 +03:00
+								    Underscore.load_state(underscore_state)
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								    while True:
-												Handle errors while multiprocessing (#8004)

* Handle errors while multiprocessing

Handle errors while multiprocessing without hanging.

* Return the traceback for errors raised while processing a batch, which
  can be handled by the top-level error handler
* Allow for shortened batches due to custom error handlers that ignore
  errors and skip documents

* Define custom components at a higher level

* Also move up custom error handler

* Use simpler component for test

* Switch error type

* Adjust test

* Only call top-level error handler for exceptions

* Register custom test components within tests

Use global functions (so they can be pickled) but register the
components only within the individual tests.
											
										
										
											2021-05-17 14:28:39 +03:00
+								        try:
 								            texts = receiver.get()
 								            docs = (make_doc(text) for text in texts)
 								            for pipe in pipes:
 								                docs = pipe(docs)
 								            # Connection does not accept unpickable objects, so send list.
 								            byte_docs = [(doc.to_bytes(), None) for doc in docs]
 								            padding = [(None, None)] * (len(texts) - len(byte_docs))
 								            sender.send(byte_docs + padding)
 								        except Exception:
 								            error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
 								            padding = [(None, None)] * (len(texts) - 1)
 								            sender.send(error_msg + padding)
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
 								class _Sender:
 								    """Util for sending data to multiprocessing workers in Language.pipe"""
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def __init__(
 								        self, data: Iterable[Any], queues: List[mp.Queue], chunk_size: int
 								    ) -> None:
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        self.data = iter(data)
 								        self.queues = iter(cycle(queues))
 								        self.chunk_size = chunk_size
 								        self.count = 0
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def send(self) -> None:
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        """Send chunk_size items from self.data to channels."""
 								        for item, q in itertools.islice(
 								            zip(self.data, cycle(self.queues)), self.chunk_size
 								        ):
 								            # cycle channels so that distribute the texts evenly
 								            q.put(item)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    def step(self) -> None:
 								        """Tell sender that comsumed one item. Data is sent to the workers after
 								        every chunk_size calls.
 								        """
-												multiprocessing pipe (#1303) (#4371)

* refactor: separate formatting docs and golds in Language.update

* fix return typo

* add pipe test

* unpickleable object cannot be assigned to p.map

* passed test pipe

* passed test!

* pipe terminate

* try pipe

* passed test

* fix ch

* add comments

* fix len(texts)

* add comment

* add comment

* fix: multiprocessing of pipe is not supported in 2

* test: use assert_docs_equal

* fix: is_python3 -> is_python2

* fix: change _pipe arg to use functools.partial

* test: add vector modification test

* test: add sample ner_pipe and user_data pipe

* add warnings test

* test: fix user warnings

* test: fix warnings capture

* fix: remove islice import

* test: remove warnings test

* test: add stream test

* test: rename

* fix: multiproc stream

* fix: stream pipe

* add comment

* mp.Pipe seems to be able to use with relative small data

* test: skip stream test in python2

* sort imports

* test: add reason to skiptest

* fix: use pipe for docs communucation

* add comments

* add comment

											
										
										
											2019-10-08 13:20:55 +03:00
+								        self.count += 1
 								        if self.count >= self.chunk_size:
 								            self.count = 0
 								            self.send()