spaCy/spacy/util.py

from typing import List, Mapping, NoReturn, Union, Dict, Any, Set, cast
from typing import Optional, Iterable, Callable, Tuple, Type
from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
from types import ModuleType
import os
import importlib
import importlib.util
import re
from pathlib import Path
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
from thinc.api import ConfigValidationError, Model
import functools
import itertools
import numpy.random
import numpy
import srsly
import catalogue
from catalogue import RegistryError, Registry
import langcodes
import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
from packaging.requirements import Requirement
import subprocess
from contextlib import contextmanager
from collections import defaultdict
import tempfile
import shutil
import shlex
import inspect
import pkgutil
import logging

try:
    import cupy.random
except ImportError:
    cupy = None

# These are functions that were previously (v2.x) available from spacy.util
# and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on,
# not from spacy.util.
from thinc.api import fix_random_seed, compounding, decaying  # noqa: F401


from .symbols import ORTH
from .compat import cupy, CudaStream, is_windows, importlib_metadata
from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
from . import about

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
    from .language import Language  # noqa: F401
    from .pipeline import Pipe  # noqa: F401
    from .tokens import Doc, Span  # noqa: F401
    from .vocab import Vocab  # noqa: F401


# fmt: off
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]

# Default order of sections in the config file. Not all sections needs to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
# fmt: on


logger = logging.getLogger("spacy")
logger_stream_handler = logging.StreamHandler()
logger_stream_handler.setFormatter(
    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
)
logger.addHandler(logger_stream_handler)


class ENV_VARS:
    CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
    PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"


class registry(thinc.registry):
    languages = catalogue.create("spacy", "languages", entry_points=True)
    architectures = catalogue.create("spacy", "architectures", entry_points=True)
    tokenizers = catalogue.create("spacy", "tokenizers", entry_points=True)
    lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
    lookups = catalogue.create("spacy", "lookups", entry_points=True)
    displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
    misc = catalogue.create("spacy", "misc", entry_points=True)
    # Callback functions used to manipulate nlp object etc.
    callbacks = catalogue.create("spacy", "callbacks", entry_points=True)
    batchers = catalogue.create("spacy", "batchers", entry_points=True)
    readers = catalogue.create("spacy", "readers", entry_points=True)
    augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
    loggers = catalogue.create("spacy", "loggers", entry_points=True)
    scorers = catalogue.create("spacy", "scorers", entry_points=True)
    # These are factories registered via third-party packages and the
    # spacy_factories entry point. This registry only exists so we can easily
    # load them via the entry points. The "true" factories are added via the
    # Language.factory decorator (in the spaCy code base and user code) and those
    # are the factories used to initialize components via registry.resolve.
    _entry_point_factories = catalogue.create("spacy", "factories", entry_points=True)
    factories = catalogue.create("spacy", "internal_factories")
    # This is mostly used to get a list of all installed models in the current
    # environment. spaCy models packaged with `spacy package` will "advertise"
    # themselves via entry points.
    models = catalogue.create("spacy", "models", entry_points=True)
    cli = catalogue.create("spacy", "cli", entry_points=True)

    @classmethod
    def get_registry_names(cls) -> List[str]:
        """List all available registries."""
        names = []
        for name, value in inspect.getmembers(cls):
            if not name.startswith("_") and isinstance(value, Registry):
                names.append(name)
        return sorted(names)

    @classmethod
    def get(cls, registry_name: str, func_name: str) -> Callable:
        """Get a registered function from the registry."""
        # We're overwriting this classmethod so we're able to provide more
        # specific error messages and implement a fallback to spacy-legacy.
        if not hasattr(cls, registry_name):
            names = ", ".join(cls.get_registry_names()) or "none"
            raise RegistryError(Errors.E892.format(name=registry_name, available=names))
        reg = getattr(cls, registry_name)
        try:
            func = reg.get(func_name)
        except RegistryError:
            if func_name.startswith("spacy."):
                legacy_name = func_name.replace("spacy.", "spacy-legacy.")
                try:
                    return reg.get(legacy_name)
                except catalogue.RegistryError:
                    pass
            available = ", ".join(sorted(reg.get_all().keys())) or "none"
            raise RegistryError(
                Errors.E893.format(
                    name=func_name, reg_name=registry_name, available=available
                )
            ) from None
        return func

    @classmethod
    def find(cls, registry_name: str, func_name: str) -> Callable:
        """Get info about a registered function from the registry."""
        # We're overwriting this classmethod so we're able to provide more
        # specific error messages and implement a fallback to spacy-legacy.
        if not hasattr(cls, registry_name):
            names = ", ".join(cls.get_registry_names()) or "none"
            raise RegistryError(Errors.E892.format(name=registry_name, available=names))
        reg = getattr(cls, registry_name)
        try:
            func_info = reg.find(func_name)
        except RegistryError:
            if func_name.startswith("spacy."):
                legacy_name = func_name.replace("spacy.", "spacy-legacy.")
                try:
                    return reg.find(legacy_name)
                except catalogue.RegistryError:
                    pass
            available = ", ".join(sorted(reg.get_all().keys())) or "none"
            raise RegistryError(
                Errors.E893.format(
                    name=func_name, reg_name=registry_name, available=available
                )
            ) from None
        return func_info

    @classmethod
    def has(cls, registry_name: str, func_name: str) -> bool:
        """Check whether a function is available in a registry."""
        if not hasattr(cls, registry_name):
            return False
        reg = getattr(cls, registry_name)
        if func_name.startswith("spacy."):
            legacy_name = func_name.replace("spacy.", "spacy-legacy.")
            return func_name in reg or legacy_name in reg
        return func_name in reg


class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Will raise an error if user or spaCy attempts to add to dict.
    """

    def __init__(self, *args, error: str = Errors.E095, **kwargs) -> None:
        """Initialize the frozen dict. Can be initialized with pre-defined
        values.

        error (str): The error message when user tries to assign to dict.
        """
        super().__init__(*args, **kwargs)
        self.error = error

    def __setitem__(self, key, value):
        raise NotImplementedError(self.error)

    def pop(self, key, default=None):
        raise NotImplementedError(self.error)

    def update(self, other):
        raise NotImplementedError(self.error)


class SimpleFrozenList(list):
    """Wrapper class around a list that lets us raise custom errors if certain
    attributes/methods are accessed. Mostly used for properties like
    Language.pipeline that return an immutable list (and that we don't want to
    convert to a tuple to not break too much backwards compatibility). If a user
    accidentally calls nlp.pipeline.append(), we can raise a more helpful error.
    """

    def __init__(self, *args, error: str = Errors.E927) -> None:
        """Initialize the frozen list.

        error (str): The error message when user tries to mutate the list.
        """
        self.error = error
        super().__init__(*args)

    def append(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def clear(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def extend(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def insert(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def pop(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def remove(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def reverse(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def sort(self, *args, **kwargs):
        raise NotImplementedError(self.error)


def lang_class_is_loaded(lang: str) -> bool:
    """Check whether a Language class is already loaded. Language classes are
    loaded lazily, to avoid expensive setup code associated with the language
    data.

    lang (str): Two-letter language code, e.g. 'en'.
    RETURNS (bool): Whether a Language class has been loaded.
    """
    return lang in registry.languages


def find_matching_language(lang: str) -> Optional[str]:
    """
    Given an IETF language code, find a supported spaCy language that is a
    close match for it (according to Unicode CLDR language-matching rules).
    This allows for language aliases, ISO 639-2 codes, more detailed language
    tags, and close matches.

    Returns the language code if a matching language is available, or None
    if there is no matching language.

    >>> find_matching_language('en')
    'en'
    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
    'pt'
    >>> find_matching_language('fra')  # an ISO 639-2 code for French
    'fr'
    >>> find_matching_language('iw')  # obsolete alias for Hebrew
    'he'
    >>> find_matching_language('no')  # Norwegian
    'nb'
    >>> find_matching_language('mo')  # old code for ro-MD
    'ro'
    >>> find_matching_language('zh-Hans')  # Simplified Chinese
    'zh'
    >>> find_matching_language('zxx')
    None
    """
    import spacy.lang  # noqa: F401

    if lang == "xx":
        return "xx"

    # Find out which language modules we have
    possible_languages = []
    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined]
        code = modinfo.name
        if code == "xx":
            # Temporarily make 'xx' into a valid language code
            possible_languages.append("mul")
        elif langcodes.tag_is_valid(code):
            possible_languages.append(code)

    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
    # more possibilities, like variants of Chinese like 'wuu', but text that
    # is labeled that way is probably trying to be distinct from 'zh' and
    # shouldn't automatically match.
    match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
    if match == "mul":
        # Convert 'mul' back to spaCy's 'xx'
        return "xx"
    else:
        return match


def get_lang_class(lang: str) -> Type["Language"]:
    """Import and load a Language class.

    lang (str): IETF language code, such as 'en'.
    RETURNS (Language): Language class.
    """
    # Check if language is registered / entry point is available
    if lang in registry.languages:
        return registry.languages.get(lang)
    else:
        # Find the language in the spacy.lang subpackage
        try:
            module = importlib.import_module(f".lang.{lang}", "spacy")
        except ImportError as err:
            # Find a matching language. For example, if the language 'no' is
            # requested, we can use language-matching to load `spacy.lang.nb`.
            try:
                match = find_matching_language(lang)
            except langcodes.tag_parser.LanguageTagError:
                # proceed to raising an import error
                match = None

            if match:
                lang = match
                module = importlib.import_module(f".lang.{lang}", "spacy")
            else:
                raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
        set_lang_class(lang, getattr(module, module.__all__[0]))  # type: ignore[attr-defined]
    return registry.languages.get(lang)


def set_lang_class(name: str, cls: Type["Language"]) -> None:
    """Set a custom Language class name that can be loaded via get_lang_class.

    name (str): Name of Language class.
    cls (Language): Language class.
    """
    registry.languages.register(name, func=cls)


def ensure_path(path: Any) -> Any:
    """Ensure string is converted to a Path.

    path (Any): Anything. If string, it's converted to Path.
    RETURNS: Path or original argument.
    """
    if isinstance(path, str):
        return Path(path)
    else:
        return path


def load_language_data(path: Union[str, Path]) -> Union[dict, list]:
    """Load JSON language data using the given path as a base. If the provided
    path isn't present, will attempt to load a gzipped version before giving up.

    path (str / Path): The data to load.
    RETURNS: The loaded data.
    """
    path = ensure_path(path)
    if path.exists():
        return srsly.read_json(path)
    path = path.with_suffix(path.suffix + ".gz")
    if path.exists():
        return srsly.read_gzip_json(path)
    raise ValueError(Errors.E160.format(path=path))


def get_module_path(module: ModuleType) -> Path:
    """Get the path of a Python module.

    module (ModuleType): The Python module.
    RETURNS (Path): The path.
    """
    if not hasattr(module, "__module__"):
        raise ValueError(Errors.E169.format(module=repr(module)))
    file_path = Path(cast(os.PathLike, sys.modules[module.__module__].__file__))
    return file_path.parent


def load_model(
    name: Union[str, Path],
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Load a model from a package or data path.

    name (str): Package name or model path.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config}
    if isinstance(name, str):  # name or string path
        if name.startswith("blank:"):  # shortcut for blank model
            return get_lang_class(name.replace("blank:", ""))()
        if is_package(name):  # installed as package
            return load_model_from_package(name, **kwargs)  # type: ignore[arg-type]
        if Path(name).exists():  # path to model data directory
            return load_model_from_path(Path(name), **kwargs)  # type: ignore[arg-type]
    elif hasattr(name, "exists"):  # Path or Path-like to model data
        return load_model_from_path(name, **kwargs)  # type: ignore[arg-type]
    if name in OLD_MODEL_SHORTCUTS:
        raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name]))  # type: ignore[index]
    raise IOError(Errors.E050.format(name=name))


def load_model_from_package(
    name: str,
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Load a model from an installed package.

    name (str): The package name.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    cls = importlib.import_module(name)
    return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config)  # type: ignore[attr-defined]


def load_model_from_path(
    model_path: Path,
    *,
    meta: Optional[Dict[str, Any]] = None,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Load a model from a data directory path. Creates Language class with
    pipeline from config.cfg and then calls from_disk() with path.

    model_path (Path): Model path.
    meta (Dict[str, Any]): Optional model meta.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    if not model_path.exists():
        raise IOError(Errors.E052.format(path=model_path))
    if not meta:
        meta = get_model_meta(model_path)
    config_path = model_path / "config.cfg"
    overrides = dict_to_dot(config)
    config = load_config(config_path, overrides=overrides)
    nlp = load_model_from_config(
        config, vocab=vocab, disable=disable, exclude=exclude, meta=meta
    )
    return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)


def load_model_from_config(
    config: Union[Dict[str, Any], Config],
    *,
    meta: Dict[str, Any] = SimpleFrozenDict(),
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    auto_fill: bool = False,
    validate: bool = True,
) -> "Language":
    """Create an nlp object from a config. Expects the full config file including
    a section "nlp" containing the settings for the nlp object.

    name (str): Package name or model path.
    meta (Dict[str, Any]): Optional model meta.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    auto_fill (bool): Whether to auto-fill config with missing defaults.
    validate (bool): Whether to show config validation errors.
    RETURNS (Language): The loaded nlp object.
    """
    if "nlp" not in config:
        raise ValueError(Errors.E985.format(config=config))
    nlp_config = config["nlp"]
    if "lang" not in nlp_config or nlp_config["lang"] is None:
        raise ValueError(Errors.E993.format(config=nlp_config))
    # This will automatically handle all codes registered via the languages
    # registry, including custom subclasses provided via entry points
    lang_cls = get_lang_class(nlp_config["lang"])
    nlp = lang_cls.from_config(
        config,
        vocab=vocab,
        disable=disable,
        exclude=exclude,
        auto_fill=auto_fill,
        validate=validate,
        meta=meta,
    )
    return nlp


def get_sourced_components(
    config: Union[Dict[str, Any], Config]
) -> Dict[str, Dict[str, Any]]:
    """RETURNS (List[str]): All sourced components in the original config,
    e.g. {"source": "en_core_web_sm"}. If the config contains a key
    "factory", we assume it refers to a component factory.
    """
    return {
        name: cfg
        for name, cfg in config.get("components", {}).items()
        if "factory" not in cfg and "source" in cfg
    }


def resolve_dot_names(
    config: Config, dot_names: List[Optional[str]]
) -> Tuple[Any, ...]:
    """Resolve one or more "dot notation" names, e.g. corpora.train.
    The paths could point anywhere into the config, so we don't know which
    top-level section we'll be looking within.

    We resolve the whole top-level section, although we could resolve less --
    we could find the lowest part of the tree.
    """
    # TODO: include schema?
    resolved = {}
    output: List[Any] = []
    errors = []
    for name in dot_names:
        if name is None:
            output.append(name)
        else:
            section = name.split(".")[0]
            # We want to avoid resolving the same thing twice
            if section not in resolved:
                if registry.is_promise(config[section]):
                    # Otherwise we can't resolve [corpus] if it's a promise
                    result = registry.resolve({"config": config[section]})["config"]
                else:
                    result = registry.resolve(config[section])
                resolved[section] = result
            try:
                output.append(dot_to_object(resolved, name))  # type: ignore[arg-type]
            except KeyError:
                msg = f"not a valid section reference: {name}"
                errors.append({"loc": name.split("."), "msg": msg})
    if errors:
        raise ConfigValidationError(config=config, errors=errors)
    return tuple(output)


def load_model_from_init_py(
    init_file: Union[Path, str],
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Helper function to use in the `load()` method of a model package's
    __init__.py.

    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    model_path = Path(init_file).parent
    meta = get_model_meta(model_path)
    data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
    data_path = model_path / data_dir
    if not model_path.exists():
        raise IOError(Errors.E052.format(path=data_path))
    return load_model_from_path(
        data_path,
        vocab=vocab,
        meta=meta,
        disable=disable,
        exclude=exclude,
        config=config,
    )


def load_config(
    path: Union[str, Path],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    interpolate: bool = False,
) -> Config:
    """Load a config file. Takes care of path validation and section order.

    path (Union[str, Path]): Path to the config file or "-" to read from stdin.
    overrides: (Dict[str, Any]): Config overrides as nested dict or
        dict keyed by section values in dot notation.
    interpolate (bool): Whether to interpolate and resolve variables.
    RETURNS (Config): The loaded config.
    """
    config_path = ensure_path(path)
    config = Config(section_order=CONFIG_SECTION_ORDER)
    if str(config_path) == "-":  # read from standard input
        return config.from_str(
            sys.stdin.read(), overrides=overrides, interpolate=interpolate
        )
    else:
        if not config_path or not config_path.is_file():
            raise IOError(Errors.E053.format(path=config_path, name="config file"))
        return config.from_disk(
            config_path, overrides=overrides, interpolate=interpolate
        )


def load_config_from_str(
    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
):
    """Load a full config from a string. Wrapper around Thinc's Config.from_str.

    text (str): The string config to load.
    interpolate (bool): Whether to interpolate and resolve variables.
    RETURNS (Config): The loaded config.
    """
    return Config(section_order=CONFIG_SECTION_ORDER).from_str(
        text, overrides=overrides, interpolate=interpolate
    )


def get_installed_models() -> List[str]:
    """List all model packages currently installed in the environment.

    RETURNS (List[str]): The string names of the models.
    """
    return list(registry.models.get_all().keys())


def get_package_version(name: str) -> Optional[str]:
    """Get the version of an installed package. Typically used to get model
    package versions.

    name (str): The name of the installed Python package.
    RETURNS (str / None): The version or None if package not installed.
    """
    try:
        return importlib_metadata.version(name)  # type: ignore[attr-defined]
    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
        return None


def is_compatible_version(
    version: str, constraint: str, prereleases: bool = True
) -> Optional[bool]:
    """Check if a version (e.g. "2.0.0") is compatible given a version
    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
    it's interpreted as =={version}.

    version (str): The version to check.
    constraint (str): The constraint string.
    prereleases (bool): Whether to allow prereleases. If set to False,
        prerelease versions will be considered incompatible.
    RETURNS (bool / None): Whether the version is compatible, or None if the
        version or constraint are invalid.
    """
    # Handle cases where exact version is provided as constraint
    if constraint[0].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        version = Version(version)  # type: ignore[assignment]
    except (InvalidSpecifier, InvalidVersion):
        return None
    spec.prereleases = prereleases
    return version in spec


def is_unconstrained_version(
    constraint: str, prereleases: bool = True
) -> Optional[bool]:
    # We have an exact version, this is the ultimate constrained version
    if constraint[0].isdigit():
        return False
    try:
        spec = SpecifierSet(constraint)
    except InvalidSpecifier:
        return None
    spec.prereleases = prereleases
    specs = [sp for sp in spec]
    # We only have one version spec and it defines > or >=
    if len(specs) == 1 and specs[0].operator in (">", ">="):
        return True
    # One specifier is exact version
    if any(sp.operator in ("==") for sp in specs):
        return False
    has_upper = any(sp.operator in ("<", "<=") for sp in specs)
    has_lower = any(sp.operator in (">", ">=") for sp in specs)
    # We have a version spec that defines an upper and lower bound
    if has_upper and has_lower:
        return False
    # Everything else, like only an upper version, only a lower version etc.
    return True


def split_requirement(requirement: str) -> Tuple[str, str]:
    """Split a requirement like spacy>=1.2.3 into ("spacy", ">=1.2.3")."""
    req = Requirement(requirement)
    return (req.name, str(req.specifier))


def get_minor_version_range(version: str) -> str:
    """Generate a version range like >=1.2.3,<1.3.0 based on a given version
    (e.g. of spaCy).
    """
    release = Version(version).release
    return f">={version},<{release[0]}.{release[1] + 1}.0"


def get_model_lower_version(constraint: str) -> Optional[str]:
    """From a version range like >=1.2.3,<1.3.0 return the lower pin."""
    try:
        specset = SpecifierSet(constraint)
        for spec in specset:
            if spec.operator in (">=", "==", "~="):
                return spec.version
    except Exception:
        pass
    return None


def get_base_version(version: str) -> str:
    """Generate the base version without any prerelease identifiers.

    version (str): The version, e.g. "3.0.0.dev1".
    RETURNS (str): The base version, e.g. "3.0.0".
    """
    return Version(version).base_version


def get_minor_version(version: str) -> Optional[str]:
    """Get the major + minor version (without patch or prerelease identifiers).

    version (str): The version.
    RETURNS (str): The major + minor version or None if version is invalid.
    """
    try:
        v = Version(version)
    except (TypeError, InvalidVersion):
        return None
    return f"{v.major}.{v.minor}"


def is_minor_version_match(version_a: str, version_b: str) -> bool:
    """Compare two versions and check if they match in major and minor, without
    patch or prerelease identifiers. Used internally for compatibility checks
    that should be insensitive to patch releases.

    version_a (str): The first version
    version_b (str): The second version.
    RETURNS (bool): Whether the versions match.
    """
    a = get_minor_version(version_a)
    b = get_minor_version(version_b)
    return a is not None and b is not None and a == b


def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
    """Load a model meta.json from a path and validate its contents.

    path (Union[str, Path]): Path to meta.json.
    RETURNS (Dict[str, Any]): The loaded meta.
    """
    path = ensure_path(path)
    if not path.parent.exists():
        raise IOError(Errors.E052.format(path=path.parent))
    if not path.exists() or not path.is_file():
        raise IOError(Errors.E053.format(path=path.parent, name="meta.json"))
    meta = srsly.read_json(path)
    for setting in ["lang", "name", "version"]:
        if setting not in meta or not meta[setting]:
            raise ValueError(Errors.E054.format(setting=setting))
    if "spacy_version" in meta:
        if not is_compatible_version(about.__version__, meta["spacy_version"]):
            lower_version = get_model_lower_version(meta["spacy_version"])
            lower_version = get_minor_version(lower_version)  # type: ignore[arg-type]
            if lower_version is not None:
                lower_version = "v" + lower_version
            elif "spacy_git_version" in meta:
                lower_version = "git commit " + meta["spacy_git_version"]
            else:
                lower_version = "version unknown"
            warn_msg = Warnings.W095.format(
                model=f"{meta['lang']}_{meta['name']}",
                model_version=meta["version"],
                version=lower_version,
                current=about.__version__,
            )
            warnings.warn(warn_msg)
        if is_unconstrained_version(meta["spacy_version"]):
            warn_msg = Warnings.W094.format(
                model=f"{meta['lang']}_{meta['name']}",
                model_version=meta["version"],
                version=meta["spacy_version"],
                example=get_minor_version_range(about.__version__),
            )
            warnings.warn(warn_msg)
    return meta


def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
    """Get model meta.json from a directory path and validate its contents.

    path (str / Path): Path to model directory.
    RETURNS (Dict[str, Any]): The model's meta data.
    """
    model_path = ensure_path(path)
    return load_meta(model_path / "meta.json")


def is_package(name: str) -> bool:
    """Check if string maps to a package installed via pip.

    name (str): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    try:
        importlib_metadata.distribution(name)  # type: ignore[attr-defined]
        return True
    except:  # noqa: E722
        return False


def get_package_path(name: str) -> Path:
    """Get the path to an installed package.

    name (str): Package name.
    RETURNS (Path): Path to installed package.
    """
    # Here we're importing the module just to find it. This is worryingly
    # indirect, but it's otherwise very difficult to find the package.
    pkg = importlib.import_module(name)
    return Path(cast(Union[str, os.PathLike], pkg.__file__)).parent


def replace_model_node(model: Model, target: Model, replacement: Model) -> None:
    """Replace a node within a model with a new one, updating refs.

    model (Model): The parent model.
    target (Model): The target node.
    replacement (Model): The node to replace the target with.
    """
    # Place the node into the sublayers
    for node in model.walk():
        if target in node.layers:
            node.layers[node.layers.index(target)] = replacement
    # Now fix any node references
    for node in model.walk():
        for ref_name in node.ref_names:
            if node.maybe_get_ref(ref_name) is target:
                node.set_ref(ref_name, replacement)


def split_command(command: str) -> List[str]:
    """Split a string command using shlex. Handles platform compatibility.

    command (str) : The command to split
    RETURNS (List[str]): The split command.
    """
    return shlex.split(command, posix=not is_windows)


def join_command(command: List[str]) -> str:
    """Join a command using shlex. shlex.join is only available for Python 3.8+,
    so we're using a workaround here.

    command (List[str]): The command to join.
    RETURNS (str): The joined command
    """
    return " ".join(shlex.quote(cmd) for cmd in command)


def run_command(
    command: Union[str, List[str]],
    *,
    stdin: Optional[Any] = None,
    capture: bool = False,
) -> subprocess.CompletedProcess:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.

    command (str / List[str]): The command. If provided as a string, the
        string will be split using shlex.split.
    stdin (Optional[Any]): stdin to read from or None.
    capture (bool): Whether to capture the output and errors. If False,
        the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    RETURNS (Optional[CompletedProcess]): The process object.
    """
    if isinstance(command, str):
        cmd_list = split_command(command)
        cmd_str = command
    else:
        cmd_list = command
        cmd_str = " ".join(command)
    try:
        ret = subprocess.run(
            cmd_list,
            env=os.environ.copy(),
            input=stdin,
            encoding="utf8",
            check=False,
            stdout=subprocess.PIPE if capture else None,
            stderr=subprocess.STDOUT if capture else None,
        )
    except FileNotFoundError:
        # Indicates the *command* wasn't found, it's an error before the command
        # is run.
        raise FileNotFoundError(
            Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
        ) from None
    if ret.returncode != 0 and capture:
        message = f"Error running command:\n\n{cmd_str}\n\n"
        message += f"Subprocess exited with status {ret.returncode}"
        if ret.stdout is not None:
            message += f"\n\nProcess log (stdout and stderr):\n\n"
            message += ret.stdout
        error = subprocess.SubprocessError(message)
        error.ret = ret  # type: ignore[attr-defined]
        error.command = cmd_str  # type: ignore[attr-defined]
        raise error
    elif ret.returncode != 0:
        sys.exit(ret.returncode)
    return ret


@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[Path]:
    """Change current working directory and returns to previous on exit.

    path (str / Path): The directory to navigate to.
    YIELDS (Path): The absolute path to the current working directory. This
        should be used if the block needs to perform actions within the working
        directory, to prevent mismatches with relative paths.
    """
    prev_cwd = Path.cwd()
    current = Path(path).resolve()
    os.chdir(str(current))
    try:
        yield current
    finally:
        os.chdir(str(prev_cwd))


@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
    """Execute a block in a temporary directory and remove the directory and
    its contents at the end of the with block.

    YIELDS (Path): The path of the temp directory.
    """
    d = Path(tempfile.mkdtemp())
    yield d
    try:
        shutil.rmtree(str(d))
    except PermissionError as e:
        warnings.warn(Warnings.W091.format(dir=d, msg=e))


def is_cwd(path: Union[Path, str]) -> bool:
    """Check whether a path is the current working directory.

    path (Union[Path, str]): The directory path.
    RETURNS (bool): Whether the path is the current working directory.
    """
    return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()


def is_in_jupyter() -> bool:
    """Check if user is running spaCy from a Jupyter notebook by detecting the
    IPython kernel. Mainly used for the displaCy visualizer.
    RETURNS (bool): True if in Jupyter, False if not.
    """
    # https://stackoverflow.com/a/39662359/6400719
    try:
        shell = get_ipython().__class__.__name__  # type: ignore[name-defined]
        if shell == "ZMQInteractiveShell":
            return True  # Jupyter notebook or qtconsole
    except NameError:
        return False  # Probably standard Python interpreter
    return False


def get_object_name(obj: Any) -> str:
    """Get a human-readable name of a Python object, e.g. a pipeline component.

    obj (Any): The Python object, typically a function or class.
    RETURNS (str): A human-readable name.
    """
    if hasattr(obj, "name") and obj.name is not None:
        return obj.name
    if hasattr(obj, "__name__"):
        return obj.__name__
    if hasattr(obj, "__class__") and hasattr(obj.__class__, "__name__"):
        return obj.__class__.__name__
    return repr(obj)


def is_same_func(func1: Callable, func2: Callable) -> bool:
    """Approximately decide whether two functions are the same, even if their
    identity is different (e.g. after they have been live reloaded). Mostly
    used in the @Language.component and @Language.factory decorators to decide
    whether to raise if a factory already exists. Allows decorator to run
    multiple times with the same function.

    func1 (Callable): The first function.
    func2 (Callable): The second function.
    RETURNS (bool): Whether it's the same function (most likely).
    """
    if not callable(func1) or not callable(func2):
        return False
    if not hasattr(func1, "__qualname__") or not hasattr(func2, "__qualname__"):
        return False
    same_name = func1.__qualname__ == func2.__qualname__
    same_file = inspect.getfile(func1) == inspect.getfile(func2)
    same_code = inspect.getsourcelines(func1) == inspect.getsourcelines(func2)
    return same_name and same_file and same_code


def get_cuda_stream(
    require: bool = False, non_blocking: bool = True
) -> Optional[CudaStream]:
    ops = get_current_ops()
    if CudaStream is None:
        return None
    elif isinstance(ops, NumpyOps):
        return None
    else:
        return CudaStream(non_blocking=non_blocking)


def get_async(stream, numpy_array):
    if cupy is None:
        return numpy_array
    else:
        array = cupy.ndarray(numpy_array.shape, order="C", dtype=numpy_array.dtype)
        array.set(numpy_array, stream=stream)
        return array


def read_regex(path: Union[str, Path]) -> Pattern:
    path = ensure_path(path)
    with path.open(encoding="utf8") as file_:
        entries = file_.read().split("\n")
    expression = "|".join(
        ["^" + re.escape(piece) for piece in entries if piece.strip()]
    )
    return re.compile(expression)


def compile_prefix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
    """Compile a sequence of prefix rules into a regex object.

    entries (Iterable[Union[str, Pattern]]): The prefix rules, e.g.
        spacy.lang.punctuation.TOKENIZER_PREFIXES.
    RETURNS (Pattern): The regex object. to be used for Tokenizer.prefix_search.
    """
    expression = "|".join(["^" + piece for piece in entries if piece.strip()])  # type: ignore[operator, union-attr]
    return re.compile(expression)


def compile_suffix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
    """Compile a sequence of suffix rules into a regex object.

    entries (Iterable[Union[str, Pattern]]): The suffix rules, e.g.
        spacy.lang.punctuation.TOKENIZER_SUFFIXES.
    RETURNS (Pattern): The regex object. to be used for Tokenizer.suffix_search.
    """
    expression = "|".join([piece + "$" for piece in entries if piece.strip()])  # type: ignore[operator, union-attr]
    return re.compile(expression)


def compile_infix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
    """Compile a sequence of infix rules into a regex object.

    entries (Iterable[Union[str, Pattern]]): The infix rules, e.g.
        spacy.lang.punctuation.TOKENIZER_INFIXES.
    RETURNS (regex object): The regex object. to be used for Tokenizer.infix_finditer.
    """
    expression = "|".join([piece for piece in entries if piece.strip()])  # type: ignore[misc, union-attr]
    return re.compile(expression)


def add_lookups(default_func: Callable[[str], Any], *lookups) -> Callable[[str], Any]:
    """Extend an attribute function with special cases. If a word is in the
    lookups, the value is returned. Otherwise the previous function is used.

    default_func (callable): The default function to execute.
    *lookups (dict): Lookup dictionary mapping string to attribute value.
    RETURNS (callable): Lexical attribute getter.
    """
    # This is implemented as functools.partial instead of a closure, to allow
    # pickle to work.
    return functools.partial(_get_attr_unless_lookup, default_func, lookups)


def _get_attr_unless_lookup(
    default_func: Callable[[str], Any], lookups: Dict[str, Any], string: str
) -> Any:
    for lookup in lookups:
        if string in lookup:
            return lookup[string]  # type: ignore[index]
    return default_func(string)


def update_exc(
    base_exceptions: Dict[str, List[dict]], *addition_dicts
) -> Dict[str, List[dict]]:
    """Update and validate tokenizer exceptions. Will overwrite exceptions.

    base_exceptions (Dict[str, List[dict]]): Base exceptions.
    *addition_dicts (Dict[str, List[dict]]): Exceptions to add to the base dict, in order.
    RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
    """
    exc = dict(base_exceptions)
    for additions in addition_dicts:
        for orth, token_attrs in additions.items():
            if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
                raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
            described_orth = "".join(attr[ORTH] for attr in token_attrs)
            if orth != described_orth:
                raise ValueError(Errors.E056.format(key=orth, orths=described_orth))
        exc.update(additions)
    exc = expand_exc(exc, "'", "’")
    return exc


def expand_exc(
    excs: Dict[str, List[dict]], search: str, replace: str
) -> Dict[str, List[dict]]:
    """Find string in tokenizer exceptions, duplicate entry and replace string.
    For example, to add additional versions with typographic apostrophes.

    excs (Dict[str, List[dict]]): Tokenizer exceptions.
    search (str): String to find and replace.
    replace (str): Replacement.
    RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
    """

    def _fix_token(token, search, replace):
        fixed = dict(token)
        fixed[ORTH] = fixed[ORTH].replace(search, replace)
        return fixed

    new_excs = dict(excs)
    for token_string, tokens in excs.items():
        if search in token_string:
            new_key = token_string.replace(search, replace)
            new_value = [_fix_token(t, search, replace) for t in tokens]
            new_excs[new_key] = new_value
    return new_excs


def normalize_slice(
    length: int, start: int, stop: int, step: Optional[int] = None
) -> Tuple[int, int]:
    if not (step is None or step == 1):
        raise ValueError(Errors.E057)
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))
    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    stop = min(length, max(start, stop))
    return start, stop


def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
    """Filter a sequence of spans and remove duplicates or overlaps. Useful for
    creating named entities (where one token can only be part of one entity) or
    when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
    longest span is preferred over shorter spans.

    spans (Iterable[Span]): The spans to filter.
    RETURNS (List[Span]): The filtered spans.
    """
    get_sort_key = lambda span: (span.end - span.start, -span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
    result = []
    seen_tokens: Set[int] = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            seen_tokens.update(range(span.start, span.end))
    result = sorted(result, key=lambda span: span.start)
    return result


def to_bytes(getters: Dict[str, Callable[[], bytes]], exclude: Iterable[str]) -> bytes:
    return srsly.msgpack_dumps(to_dict(getters, exclude))


def from_bytes(
    bytes_data: bytes,
    setters: Dict[str, Callable[[bytes], Any]],
    exclude: Iterable[str],
) -> None:
    return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude)  # type: ignore[return-value]


def to_dict(
    getters: Dict[str, Callable[[], Any]], exclude: Iterable[str]
) -> Dict[str, Any]:
    serialized = {}
    for key, getter in getters.items():
        # Split to support file names like meta.json
        if key.split(".")[0] not in exclude:
            serialized[key] = getter()
    return serialized


def from_dict(
    msg: Dict[str, Any],
    setters: Dict[str, Callable[[Any], Any]],
    exclude: Iterable[str],
) -> Dict[str, Any]:
    for key, setter in setters.items():
        # Split to support file names like meta.json
        if key.split(".")[0] not in exclude and key in msg:
            setter(msg[key])
    return msg


def to_disk(
    path: Union[str, Path],
    writers: Dict[str, Callable[[Path], None]],
    exclude: Iterable[str],
) -> Path:
    path = ensure_path(path)
    if not path.exists():
        path.mkdir()
    for key, writer in writers.items():
        # Split to support file names like meta.json
        if key.split(".")[0] not in exclude:
            writer(path / key)
    return path


def from_disk(
    path: Union[str, Path],
    readers: Dict[str, Callable[[Path], None]],
    exclude: Iterable[str],
) -> Path:
    path = ensure_path(path)
    for key, reader in readers.items():
        # Split to support file names like meta.json
        if key.split(".")[0] not in exclude:
            reader(path / key)
    return path


def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
    """Import module from a file. Used to load models from a directory.

    name (str): Name of module to load.
    loc (str / Path): Path to the file.
    RETURNS: The loaded module.
    """
    spec = importlib.util.spec_from_file_location(name, str(loc))
    module = importlib.util.module_from_spec(spec)  # type: ignore[arg-type]
    spec.loader.exec_module(module)  # type: ignore[union-attr]
    return module


def minify_html(html: str) -> str:
    """Perform a template-specific, rudimentary HTML minification for displaCy.
    Disclaimer: NOT a general-purpose solution, only removes indentation and
    newlines.

    html (str): Markup to minify.
    RETURNS (str): "Minified" HTML.
    """
    return html.strip().replace("    ", "").replace("\n", "")


def escape_html(text: str) -> str:
    """Replace <, >, &, " with their HTML encoded representation. Intended to
    prevent HTML errors in rendered displaCy markup.

    text (str): The original text.
    RETURNS (str): Equivalent text to be safely used within HTML.
    """
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace('"', "&quot;")
    return text


def get_words_and_spaces(
    words: Iterable[str], text: str
) -> Tuple[List[str], List[bool]]:
    """Given a list of words and a text, reconstruct the original tokens and
    return a list of words and spaces that can be used to create a Doc. This
    can help recover destructive tokenization that didn't preserve any
    whitespace information.

    words (Iterable[str]): The words.
    text (str): The original text.
    RETURNS (Tuple[List[str], List[bool]]): The words and spaces.
    """
    if "".join("".join(words).split()) != "".join(text.split()):
        raise ValueError(Errors.E194.format(text=text, words=words))
    text_words = []
    text_spaces = []
    text_pos = 0
    # normalize words to remove all whitespace tokens
    norm_words = [word for word in words if not word.isspace()]
    # align words with text
    for word in norm_words:
        try:
            word_start = text[text_pos:].index(word)
        except ValueError:
            raise ValueError(Errors.E194.format(text=text, words=words)) from None
        if word_start > 0:
            text_words.append(text[text_pos : text_pos + word_start])
            text_spaces.append(False)
            text_pos += word_start
        text_words.append(word)
        text_spaces.append(False)
        text_pos += len(word)
        if text_pos < len(text) and text[text_pos] == " ":
            text_spaces[-1] = True
            text_pos += 1
    if text_pos < len(text):
        text_words.append(text[text_pos:])
        text_spaces.append(False)
    return (text_words, text_spaces)


def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
    """Deep copy a Config. Will raise an error if the config contents are not
    JSON-serializable.

    config (Config): The config to copy.
    RETURNS (Config): The copied config.
    """
    try:
        return Config(config).copy()
    except ValueError:
        raise ValueError(Errors.E961.format(config=config)) from None


def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
    """Convert dot notation to a dict. For example: {"token.pos": True,
    "token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.

    values (Dict[str, Any]): The key/value pairs to convert.
    RETURNS (Dict[str, dict]): The converted values.
    """
    result: Dict[str, dict] = {}
    for key, value in values.items():
        path = result
        parts = key.lower().split(".")
        for i, item in enumerate(parts):
            is_last = i == len(parts) - 1
            path = path.setdefault(item, value if is_last else {})
    return result


def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
    """Convert dot notation to a dict. For example: {"token": {"pos": True,
    "_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.

    values (Dict[str, dict]): The dict to convert.
    RETURNS (Dict[str, Any]): The key/value pairs.
    """
    return {".".join(key): value for key, value in walk_dict(obj)}


def dot_to_object(config: Config, section: str):
    """Convert dot notation of a "section" to a specific part of the Config.
    e.g. "training.optimizer" would return the Optimizer object.
    Throws an error if the section is not defined in this config.

    config (Config): The config.
    section (str): The dot notation of the section in the config.
    RETURNS: The object denoted by the section
    """
    component = config
    parts = section.split(".")
    for item in parts:
        try:
            component = component[item]
        except (KeyError, TypeError):
            raise KeyError(Errors.E952.format(name=section)) from None
    return component


def set_dot_to_object(config: Config, section: str, value: Any) -> None:
    """Update a config at a given position from a dot notation.

    config (Config): The config.
    section (str): The dot notation of the section in the config.
    value (Any): The value to set in the config.
    """
    component = config
    parts = section.split(".")
    for i, item in enumerate(parts):
        try:
            if i == len(parts) - 1:
                component[item] = value
            else:
                component = component[item]
        except (KeyError, TypeError):
            raise KeyError(Errors.E952.format(name=section)) from None


def walk_dict(
    node: Dict[str, Any], parent: List[str] = []
) -> Iterator[Tuple[List[str], Any]]:
    """Walk a dict and yield the path and values of the leaves."""
    for key, value in node.items():
        key_parent = [*parent, key]
        if isinstance(value, dict):
            yield from walk_dict(value, key_parent)
        else:
            yield (key_parent, value)


def get_arg_names(func: Callable) -> List[str]:
    """Get a list of all named arguments of a function (regular,
    keyword-only).

    func (Callable): The function
    RETURNS (List[str]): The argument names.
    """
    argspec = inspect.getfullargspec(func)
    return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))


def combine_score_weights(
    weights: List[Dict[str, Optional[float]]],
    overrides: Dict[str, Optional[float]] = SimpleFrozenDict(),
) -> Dict[str, Optional[float]]:
    """Combine and normalize score weights defined by components, e.g.
    {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.

    weights (List[dict]): The weights defined by the components.
    overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
        should be preserved.
    RETURNS (Dict[str, float]): The combined and normalized weights.
    """
    # We divide each weight by the total weight sum.
    # We first need to extract all None/null values for score weights that
    # shouldn't be shown in the table *or* be weighted
    result: Dict[str, Optional[float]] = {
        key: value for w_dict in weights for (key, value) in w_dict.items()
    }
    result.update(overrides)
    weight_sum = sum([v if v else 0.0 for v in result.values()])
    for key, value in result.items():
        if value and weight_sum > 0:
            result[key] = round(value / weight_sum, 2)
    return result


class DummyTokenizer:
    def __call__(self, text):
        raise NotImplementedError

    def pipe(self, texts, **kwargs):
        for text in texts:
            yield self(text)

    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
    def to_bytes(self, **kwargs):
        return b""

    def from_bytes(self, data: bytes, **kwargs) -> "DummyTokenizer":
        return self

    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
        return None

    def from_disk(self, path: Union[str, Path], **kwargs) -> "DummyTokenizer":
        return self


def create_default_optimizer() -> Optimizer:
    return Adam()


def minibatch(items, size):
    """Iterate over batches of items. `size` may be an iterator,
    so that batch-size can vary on each step.
    """
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    else:
        size_ = size
    items = iter(items)
    while True:
        batch_size = next(size_)
        batch = list(itertools.islice(items, int(batch_size)))
        if len(batch) == 0:
            break
        yield list(batch)


def is_cython_func(func: Callable) -> bool:
    """Slightly hacky check for whether a callable is implemented in Cython.
    Can be used to implement slightly different behaviors, especially around
    inspecting and parameter annotations. Note that this will only return True
    for actual cdef functions and methods, not regular Python functions defined
    in Python modules.

    func (Callable): The callable to check.
    RETURNS (bool): Whether the callable is Cython (probably).
    """
    attr = "__pyx_vtable__"
    if hasattr(func, attr):  # function or class instance
        return True
    # https://stackoverflow.com/a/55767059
    if (
        hasattr(func, "__qualname__")
        and hasattr(func, "__module__")
        and func.__module__ in sys.modules
    ):  # method
        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
        return hasattr(cls_func, attr)
    return False


def check_bool_env_var(env_var: str) -> bool:
    """Convert the value of an environment variable to a boolean. Add special
    check for "0" (falsy) and consider everything else truthy, except unset.

    env_var (str): The name of the environment variable to check.
    RETURNS (bool): Its boolean value.
    """
    value = os.environ.get(env_var, False)
    if value == "0":
        return False
    return bool(value)


def _pipe(
    docs: Iterable["Doc"],
    proc: "Pipe",
    name: str,
    default_error_handler: Callable[[str, "Pipe", List["Doc"], Exception], NoReturn],
    kwargs: Mapping[str, Any],
) -> Iterator["Doc"]:
    if hasattr(proc, "pipe"):
        yield from proc.pipe(docs, **kwargs)
    else:
        # We added some args for pipe that __call__ doesn't expect.
        kwargs = dict(kwargs)
        error_handler = default_error_handler
        if hasattr(proc, "get_error_handler"):
            error_handler = proc.get_error_handler()
        for arg in ["batch_size"]:
            if arg in kwargs:
                kwargs.pop(arg)
        for doc in docs:
            try:
                doc = proc(doc, **kwargs)  # type: ignore[call-arg]
                yield doc
            except Exception as e:
                error_handler(name, proc, [doc], e)


def raise_error(proc_name, proc, docs, e):
    raise e


def ignore_error(proc_name, proc, docs, e):
    pass


def warn_if_jupyter_cupy():
    """Warn about require_gpu if a jupyter notebook + cupy + mismatched
    contextvars vs. thread ops are detected
    """
    if is_in_jupyter():
        from thinc.backends.cupy_ops import CupyOps

        if CupyOps.xp is not None:
            from thinc.backends import contextvars_eq_thread_ops

            if not contextvars_eq_thread_ops():
                warnings.warn(Warnings.W111)


def check_lexeme_norms(vocab, component_name):
    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
        langs = ", ".join(LEXEME_NORM_LANGS)
        logger.debug(Warnings.W033.format(model=component_name, langs=langs))


def to_ternary_int(val) -> int:
    """Convert a value to the ternary 1/0/-1 int used for True/None/False in
    attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
    (None), any other values are -1 (False).
    """
    if val is True:
        return 1
    elif val is None:
        return 0
    elif val is False:
        return -1
    elif val == 1:
        return 1
    elif val == 0:
        return 0
    else:
        return -1


# The following implementation of packages_distributions() is adapted from
# importlib_metadata, which is distributed under the Apache 2.0 License.
# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw
# See licenses/3rd_party_licenses.txt
def packages_distributions() -> Dict[str, List[str]]:
    """Return a mapping of top-level packages to their distributions. We're
    inlining this helper from the importlib_metadata "backport" here, since
    it's not available in the builtin importlib.metadata.
    """
    pkg_to_dist = defaultdict(list)
    for dist in importlib_metadata.distributions():
        for pkg in (dist.read_text("top_level.txt") or "").split():
            pkg_to_dist[pkg].append(dist.metadata["Name"])
    return dict(pkg_to_dist)
-												Fix issues for Mypy 0.950 and Pydantic 1.9.0 (#10786)

* Make changes to typing

* Correction

* Format with black

* Corrections based on review

* Bumped Thinc dependency version

* Bumped blis requirement

* Correction for older Python versions

* Update spacy/ml/models/textcat.py

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

* Corrections based on review feedback

* Readd deleted docstring line

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
											
										
										
											2022-05-25 10:33:54 +03:00
+								from typing import List, Mapping, NoReturn, Union, Dict, Any, Set, cast
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								from typing import Optional, Iterable, Callable, Tuple, Type
-												Allow spacy project to push and pull to/from remote storage (#5949)

* Add utils for working with remote storage

* WIP add remote_cache for project

* WIP add push and pull commands

* Use pathy in remote_cache

* Updarte util

* Update remote_cache

* Update util

* Update project assets

* Update pull script

* Update push script

* Fix type annotation in util

* Work on remote storage

* Remove site and env hash

* Fix imports

* Fix type annotation

* Require pathy

* Require pathy

* Fix import

* Add a util to handle project variable substitution

* Import push and pull commands

* Fix pull command

* Fix push command

* Fix tarfile in remote_storage

* Improve printing

* Fiddle with status messages

* Set version to v3.0.0a9

* Draft docs for spacy project remote storages

* Update docs [ci skip]

* Use Thinc config to simplify and unify template variables

* Auto-format

* Don't import Pathy globally for now

Causes slow and annoying Google Cloud warning

* Tidy up test

* Tidy up and update tests

* Update to latest Thinc

* Update docs

* variables -> vars

* Update docs [ci skip]

* Update docs [ci skip]

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-23 19:32:09 +03:00
+								from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								from types import ModuleType
-												Add util.env_opt support: Can set hyper params through environment variables.

											
										
										
											2017-05-18 12:36:53 +03:00
+								import os
-												Move is_package and get_model_package_path to util

											
										
										
											2017-05-08 00:24:51 +03:00
+								import importlib
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								import importlib.util
-												Replacing regex library with re to increase tokenization speed (#3218)

* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alterations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive

											
										
										
											2019-02-01 10:05:22 +03:00
+								import re
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
+								from pathlib import Path
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								import thinc
-												Update docs and types

											
										
										
											2020-07-31 18:02:54 +03:00
+								from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-												Add option to replace listeners for sourced components

											
										
										
											2021-01-29 07:57:04 +03:00
+								from thinc.api import ConfigValidationError, Model
-												Convert closure to functools.partial, to promote pickling

											
										
										
											2017-10-17 19:20:52 +03:00
+								import functools
-												Add missing import (fixes #1546)

											
										
										
											2017-11-10 21:05:18 +03:00
+								import itertools
-												Add contributor agreement for emulbreh

											
										
										
											2018-02-13 14:52:48 +03:00
+								import numpy.random
-												Use max(uint64) for OOV lexeme rank (#5303)

* Use max(uint64) for OOV lexeme rank

* Add test for default OOV rank

* Revert back to thinc==7.4.0

Requiring the updated version of thinc was unnecessary.

* Define OOV_RANK in one place

Define OOV_RANK in one place in `util`.

* Fix formatting [ci skip]

* Switch to external definitions of max(uint64)

Switch to external defintions of max(uint64) and confirm that they are
equal.
											
										
										
											2020-04-15 14:49:47 +03:00
+								import numpy
-												💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintainence harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning

											
										
										
											2018-12-03 03:28:22 +03:00
+								import srsly
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								import catalogue
-												Include available registry names in error

											
										
										
											2021-01-16 06:35:03 +03:00
+								from catalogue import RegistryError, Registry
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								import langcodes
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								import sys
-												Add missing import
											
										
										
											2020-04-28 15:01:29 +03:00
+								import warnings
-												Use more sophisticated version parsing logic

											
										
										
											2020-05-30 16:01:58 +03:00
+								from packaging.specifiers import SpecifierSet, InvalidSpecifier
 								from packaging.version import Version, InvalidVersion
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
+								from packaging.requirements import Requirement
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								import subprocess
 								from contextlib import contextmanager
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
+								from collections import defaultdict
-												Update project CLI

											
										
										
											2020-06-22 15:53:31 +03:00
+								import tempfile
 								import shutil
-												split_command util function

											
										
										
											2020-06-30 13:54:15 +03:00
+								import shlex
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								import inspect
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								import pkgutil
-												Replace lexeme_norm warning with logging

											
										
										
											2020-08-14 16:00:52 +03:00
+								import logging
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
-												Set cupy.random seed in fix_random_seed helper

											
										
										
											2018-12-08 14:37:38 +03:00
+								try:
 								    import cupy.random
 								except ImportError:
 								    cupy = None
-												Remove dead and/or deprecated code (#5710)

* Remove dead and/or deprecated code

* Remove n_threads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-06 14:06:25 +03:00
+								# These are functions that were previously (v2.x) available from spacy.util
 								# and have since moved to Thinc. We're importing them here so people's code
 								# doesn't break, but they should always be imported from Thinc from now on,
 								# not from spacy.util.
 								from thinc.api import fix_random_seed, compounding, decaying  # noqa: F401
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								from .symbols import ORTH
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
+								from .compat import cupy, CudaStream, is_windows, importlib_metadata
-												Add better error for failed model shortcut loading

											
										
										
											2020-08-06 00:10:29 +03:00
+								from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								from . import about
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								if TYPE_CHECKING:
 								    # This lets us add type hints for mypy etc. without causing circular imports
 								    from .language import Language  # noqa: F401
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    from .pipeline import Pipe  # noqa: F401
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    from .tokens import Doc, Span  # noqa: F401
 								    from .vocab import Vocab  # noqa: F401
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
-												Handle raw_input vs input in Python 2 and 3

											
										
										
											2017-03-21 00:48:32 +03:00
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								# fmt: off
-												Use max(uint64) for OOV lexeme rank (#5303)

* Use max(uint64) for OOV lexeme rank

* Add test for default OOV rank

* Revert back to thinc==7.4.0

Requiring the updated version of thinc was unnecessary.

* Define OOV_RANK in one place

Define OOV_RANK in one place in `util`.

* Fix formatting [ci skip]

* Switch to external definitions of max(uint64)

Switch to external defintions of max(uint64) and confirm that they are
equal.
											
										
										
											2020-04-15 14:49:47 +03:00
+								OOV_RANK = numpy.iinfo(numpy.uint64).max
-												Refactor CLI

											
										
										
											2020-09-28 16:09:59 +03:00
+								DEFAULT_OOV_PROB = -20
-												Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if
`NORM` is used as an attribute, so it may warn in cases where it's not
relevant.

											
										
										
											2021-03-19 12:45:16 +03:00
+								LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
-												Fix references to config file in the docs & UX (#9961)

* doc fixes around config file

* fix typo

* clarify default
											
										
										
											2022-01-04 16:31:26 +03:00
+								# Default order of sections in the config file. Not all sections needs to exist,
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								# and additional sections are added at the end, in alphabetical order.
-												Update CLI and add [initialize] block

											
										
										
											2020-09-28 12:56:14 +03:00
+								CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								# fmt: on
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
-												Replace lexeme_norm warning with logging

											
										
										
											2020-08-14 16:00:52 +03:00
+								logger = logging.getLogger("spacy")
-												prevent the root logger from inialising

											
										
										
											2020-12-15 22:50:34 +03:00
+								logger_stream_handler = logging.StreamHandler()
-												Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if
`NORM` is used as an attribute, so it may warn in cases where it's not
relevant.

											
										
										
											2021-03-19 12:45:16 +03:00
+								logger_stream_handler.setFormatter(
 								    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
 								)
-												prevent the root logger from inialising

											
										
										
											2020-12-15 22:50:34 +03:00
+								logger.addHandler(logger_stream_handler)
-												Replace lexeme_norm warning with logging

											
										
										
											2020-08-14 16:00:52 +03:00
-												Tidy up env vars [ci skip]

											
										
										
											2020-09-30 16:15:11 +03:00
+								class ENV_VARS:
 								    CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
-												Enable commit check via env var

											
										
										
											2020-10-05 21:51:15 +03:00
+								    PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
-												Tidy up env vars [ci skip]

											
										
										
											2020-09-30 16:15:11 +03:00
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								class registry(thinc.registry):
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    languages = catalogue.create("spacy", "languages", entry_points=True)
 								    architectures = catalogue.create("spacy", "architectures", entry_points=True)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    tokenizers = catalogue.create("spacy", "tokenizers", entry_points=True)
 								    lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    lookups = catalogue.create("spacy", "lookups", entry_points=True)
 								    displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
-												registry.assets -> registry.misc

											
										
										
											2020-09-03 18:31:14 +03:00
+								    misc = catalogue.create("spacy", "misc", entry_points=True)
-												Add config callbacks for modifying nlp object before and after init (#5866)

* WIP: Concept for modifying nlp object before and after init

* Make callbacks return nlp object

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Raise if callbacks don't return correct type

* Rename, update types, add after_pipeline_creation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-05 20:47:54 +03:00
+								    # Callback functions used to manipulate nlp object etc.
-												Check for callbacks entry points

											
										
										
											2021-03-18 23:18:25 +03:00
+								    callbacks = catalogue.create("spacy", "callbacks", entry_points=True)
-												Create corpus iterator and batcher from registry during training (#5865)

* Move batchers into their own module (and registry)

* Update CLI

* Update Corpus and batcher

* Update tests

* Update one config

* Merge 'evaluation' block back under [training]

* Import batchers in gold __init__

* Fix batchers

* Update config

* Update schema

* Update util

* Don't assume train and dev are actually paths

* Update onto-joint config

* Fix missing import

* Format

* Format

* Update spacy/gold/corpus.py

Co-authored-by: Ines Montani <ines@ines.io>

* Fix name

* Update default config

* Fix get_length option in batchers

* Update test

* Add comment

* Pass path into Corpus

* Update docstring

* Update schema and configs

* Update config

* Fix test

* Fix paths

* Fix print

* Fix create_train_batches

* [training.read_train] -> [training.train_corpus]

* Update onto-joint config

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-04 16:09:37 +03:00
+								    batchers = catalogue.create("spacy", "batchers", entry_points=True)
 								    readers = catalogue.create("spacy", "readers", entry_points=True)
-												Support data augmentation in Corpus (#6155)

* Support data augmentation in Corpus

* Note initial docs for data augmentation

* Add augmenter to quickstart

* Fix flake8

* Format

* Fix test

* Update spacy/tests/training/test_training.py

* Improve data augmentation arguments

* Update templates

* Move randomization out into caller

* Refactor

* Update spacy/training/augment.py

* Update spacy/tests/training/test_training.py

* Fix augment

* Fix test
											
										
										
											2020-09-28 04:03:27 +03:00
+								    augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
-												Weights & Biases logger for train CLI (#5971)

* quick test as part of train script

* train_logger in config, default ConsoleLogger in loggers catalogue

* entitiy typo

* add wandb_logger

* cleanup

* Update spacy/cli/train_logger.py

Co-authored-by: Ines Montani <ines@ines.io>

* move loggers to gold.loggers

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-26 16:24:33 +03:00
+								    loggers = catalogue.create("spacy", "loggers", entry_points=True)
-												Refactor scoring methods to use registered functions (#8766)

* Add scorer option to components

Add an optional `scorer` parameter to all pipeline components. If a
scoring function is provided, it overrides the default scoring method
for that component.

* Add registered scorers for all components

* Add `scorers` registry
* Move all scoring methods outside of components as independent
  functions and register
* Use the registered scoring methods as defaults in configs and inits

Additional:

* The scoring methods no longer have access to the full component, so
  use settings from `cfg` as default scorer options to handle settings
  such as `labels`, `threshold`, and `positive_label`
* The `attribute_ruler` scoring method no longer has access to the
  patterns, so all scoring methods are called
* Bug fix: `spancat` scoring method is updated to set `allow_overlap` to
  score overlapping spans correctly

* Update Russian lemmatizer to use direct score method

* Check type of cfg in Pipe.score

* Fix check

* Update spacy/pipeline/sentencizer.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove validate_examples from scoring functions

* Use Pipe.labels instead of Pipe.cfg["labels"]

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2021-08-10 16:13:39 +03:00
+								    scorers = catalogue.create("spacy", "scorers", entry_points=True)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    # These are factories registered via third-party packages and the
 								    # spacy_factories entry point. This registry only exists so we can easily
 								    # load them via the entry points. The "true" factories are added via the
 								    # Language.factory decorator (in the spaCy code base and user code) and those
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								    # are the factories used to initialize components via registry.resolve.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    _entry_point_factories = catalogue.create("spacy", "factories", entry_points=True)
 								    factories = catalogue.create("spacy", "internal_factories")
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								    # This is mostly used to get a list of all installed models in the current
 								    # environment. spaCy models packaged with `spacy package` will "advertise"
 								    # themselves via entry points.
 								    models = catalogue.create("spacy", "models", entry_points=True)
-												Add CLI registry (#6037)


											
										
										
											2020-09-08 16:23:34 +03:00
+								    cli = catalogue.create("spacy", "cli", entry_points=True)
-												Move lookup tables out of the core library (#4346)

* Add default to util.get_entry_point

* Tidy up entry points

* Read lookups from entry points

* Remove lookup tables and related tests

* Add lookups install option

* Remove lemmatizer tests

* Remove logic to process language data files

* Update setup.cfg

											
										
										
											2019-10-01 01:01:27 +03:00
-												Include available registry names in error

											
										
										
											2021-01-16 06:35:03 +03:00
+								    @classmethod
 								    def get_registry_names(cls) -> List[str]:
 								        """List all available registries."""
 								        names = []
 								        for name, value in inspect.getmembers(cls):
 								            if not name.startswith("_") and isinstance(value, Registry):
 								                names.append(name)
 								        return sorted(names)
-												Support spacy-legacy via the registry

											
										
										
											2021-01-15 13:42:40 +03:00
+								    @classmethod
 								    def get(cls, registry_name: str, func_name: str) -> Callable:
 								        """Get a registered function from the registry."""
 								        # We're overwriting this classmethod so we're able to provide more
 								        # specific error messages and implement a fallback to spacy-legacy.
 								        if not hasattr(cls, registry_name):
-												Include available registry names in error

											
										
										
											2021-01-16 06:35:03 +03:00
+								            names = ", ".join(cls.get_registry_names()) or "none"
-												Fix error code

											
										
										
											2021-01-18 03:43:45 +03:00
+								            raise RegistryError(Errors.E892.format(name=registry_name, available=names))
-												Support spacy-legacy via the registry

											
										
										
											2021-01-15 13:42:40 +03:00
+								        reg = getattr(cls, registry_name)
 								        try:
 								            func = reg.get(func_name)
-												Raise RegistryError

											
										
										
											2021-01-16 04:57:13 +03:00
+								        except RegistryError:
-												Support spacy-legacy via the registry

											
										
										
											2021-01-15 13:42:40 +03:00
+								            if func_name.startswith("spacy."):
 								                legacy_name = func_name.replace("spacy.", "spacy-legacy.")
 								                try:
 								                    return reg.get(legacy_name)
 								                except catalogue.RegistryError:
 								                    pass
 								            available = ", ".join(sorted(reg.get_all().keys())) or "none"
-												Raise RegistryError

											
										
										
											2021-01-16 04:57:13 +03:00
+								            raise RegistryError(
-												Support spacy-legacy via the registry

											
										
										
											2021-01-15 13:42:40 +03:00
+								                Errors.E893.format(
 								                    name=func_name, reg_name=registry_name, available=available
 								                )
 								            ) from None
 								        return func
-												Handle spacy-legacy in package CLI for dependencies (#9163)

* Handle spacy-legacy in package CLI for dependencies

* Implement legacy backoff in spacy registry.find

* Remove unused import

* Update and format test
											
										
										
											2021-09-08 12:46:40 +03:00
+								    @classmethod
 								    def find(cls, registry_name: str, func_name: str) -> Callable:
 								        """Get info about a registered function from the registry."""
 								        # We're overwriting this classmethod so we're able to provide more
 								        # specific error messages and implement a fallback to spacy-legacy.
 								        if not hasattr(cls, registry_name):
 								            names = ", ".join(cls.get_registry_names()) or "none"
 								            raise RegistryError(Errors.E892.format(name=registry_name, available=names))
 								        reg = getattr(cls, registry_name)
 								        try:
 								            func_info = reg.find(func_name)
 								        except RegistryError:
 								            if func_name.startswith("spacy."):
 								                legacy_name = func_name.replace("spacy.", "spacy-legacy.")
 								                try:
 								                    return reg.find(legacy_name)
 								                except catalogue.RegistryError:
 								                    pass
 								            available = ", ".join(sorted(reg.get_all().keys())) or "none"
 								            raise RegistryError(
 								                Errors.E893.format(
 								                    name=func_name, reg_name=registry_name, available=available
 								                )
 								            ) from None
 								        return func_info
-												Support spacy-legacy via the registry

											
										
										
											2021-01-15 13:42:40 +03:00
+								    @classmethod
 								    def has(cls, registry_name: str, func_name: str) -> bool:
 								        """Check whether a function is available in a registry."""
 								        if not hasattr(cls, registry_name):
 								            return False
 								        reg = getattr(cls, registry_name)
 								        if func_name.startswith("spacy."):
 								            legacy_name = func_name.replace("spacy.", "spacy-legacy.")
 								            return func_name in reg or legacy_name in reg
 								        return func_name in reg
-												Move lookup tables out of the core library (#4346)

* Add default to util.get_entry_point

* Tidy up entry points

* Read lookups from entry points

* Remove lookup tables and related tests

* Add lookups install option

* Remove lemmatizer tests

* Remove logic to process language data files

* Update setup.cfg

											
										
										
											2019-10-01 01:01:27 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								class SimpleFrozenDict(dict):
 								    """Simplified implementation of a frozen dict, mainly used as default
 								    function or method argument (for arguments that should default to empty
 								    dictionary). Will raise an error if user or spaCy attempts to add to dict.
 								    """
 								    def __init__(self, *args, error: str = Errors.E095, **kwargs) -> None:
 								        """Initialize the frozen dict. Can be initialized with pre-defined
 								        values.
 								        error (str): The error message when user tries to assign to dict.
 								        """
 								        super().__init__(*args, **kwargs)
 								        self.error = error
 								    def __setitem__(self, key, value):
 								        raise NotImplementedError(self.error)
 								    def pop(self, key, default=None):
 								        raise NotImplementedError(self.error)
 								    def update(self, other):
 								        raise NotImplementedError(self.error)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								class SimpleFrozenList(list):
 								    """Wrapper class around a list that lets us raise custom errors if certain
 								    attributes/methods are accessed. Mostly used for properties like
 								    Language.pipeline that return an immutable list (and that we don't want to
 								    convert to a tuple to not break too much backwards compatibility). If a user
 								    accidentally calls nlp.pipeline.append(), we can raise a more helpful error.
 								    """
 								    def __init__(self, *args, error: str = Errors.E927) -> None:
 								        """Initialize the frozen list.
 								        error (str): The error message when user tries to mutate the list.
 								        """
 								        self.error = error
 								        super().__init__(*args)
 								    def append(self, *args, **kwargs):
 								        raise NotImplementedError(self.error)
 								    def clear(self, *args, **kwargs):
 								        raise NotImplementedError(self.error)
 								    def extend(self, *args, **kwargs):
 								        raise NotImplementedError(self.error)
 								    def insert(self, *args, **kwargs):
 								        raise NotImplementedError(self.error)
 								    def pop(self, *args, **kwargs):
 								        raise NotImplementedError(self.error)
 								    def remove(self, *args, **kwargs):
 								        raise NotImplementedError(self.error)
 								    def reverse(self, *args, **kwargs):
 								        raise NotImplementedError(self.error)
 								    def sort(self, *args, **kwargs):
 								        raise NotImplementedError(self.error)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def lang_class_is_loaded(lang: str) -> bool:
-												Add support for vocab.writing_system property (#3390)

* Add xfail test for vocab.writing_system

* Add vocab.writing_system property

* Set Language.Defaults.writing_system

* Set default writing system

* Remove xfail on test_vocab_writing_system

											
										
										
											2019-03-11 17:23:20 +03:00
+								    """Check whether a Language class is already loaded. Language classes are
 								    loaded lazily, to avoid expensive setup code associated with the language
 								    data.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    lang (str): Two-letter language code, e.g. 'en'.
-												Add support for vocab.writing_system property (#3390)

* Add xfail test for vocab.writing_system

* Add vocab.writing_system property

* Set Language.Defaults.writing_system

* Set default writing system

* Remove xfail on test_vocab_writing_system

											
										
										
											2019-03-11 17:23:20 +03:00
+								    RETURNS (bool): Whether a Language class has been loaded.
 								    """
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    return lang in registry.languages
-												Auto-format [ci skip]

											
										
										
											2019-03-11 19:10:50 +03:00
-												Add support for vocab.writing_system property (#3390)

* Add xfail test for vocab.writing_system

* Add vocab.writing_system property

* Set Language.Defaults.writing_system

* Set default writing system

* Remove xfail on test_vocab_writing_system

											
										
										
											2019-03-11 17:23:20 +03:00
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								def find_matching_language(lang: str) -> Optional[str]:
 								    """
 								    Given an IETF language code, find a supported spaCy language that is a
 								    close match for it (according to Unicode CLDR language-matching rules).
 								    This allows for language aliases, ISO 639-2 codes, more detailed language
 								    tags, and close matches.
 								    Returns the language code if a matching language is available, or None
 								    if there is no matching language.
 								    >>> find_matching_language('en')
 								    'en'
 								    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
 								    'pt'
 								    >>> find_matching_language('fra')  # an ISO 639-2 code for French
 								    'fr'
 								    >>> find_matching_language('iw')  # obsolete alias for Hebrew
 								    'he'
 								    >>> find_matching_language('no')  # Norwegian
 								    'nb'
 								    >>> find_matching_language('mo')  # old code for ro-MD
 								    'ro'
 								    >>> find_matching_language('zh-Hans')  # Simplified Chinese
 								    'zh'
 								    >>> find_matching_language('zxx')
 								    None
 								    """
 								    import spacy.lang  # noqa: F401
-												Format (#9630)


											
										
										
											2021-11-05 11:56:26 +03:00
 								    if lang == "xx":
 								        return "xx"
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
 								    # Find out which language modules we have
 								    possible_languages = []
-												Fix issues for Mypy 0.950 and Pydantic 1.9.0 (#10786)

* Make changes to typing

* Correction

* Format with black

* Corrections based on review

* Bumped Thinc dependency version

* Bumped blis requirement

* Correction for older Python versions

* Update spacy/ml/models/textcat.py

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

* Corrections based on review feedback

* Readd deleted docstring line

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
											
										
										
											2022-05-25 10:33:54 +03:00
+								    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined]
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								        code = modinfo.name
-												Format (#9630)


											
										
										
											2021-11-05 11:56:26 +03:00
+								        if code == "xx":
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								            # Temporarily make 'xx' into a valid language code
-												Format (#9630)


											
										
										
											2021-11-05 11:56:26 +03:00
+								            possible_languages.append("mul")
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								        elif langcodes.tag_is_valid(code):
 								            possible_languages.append(code)
 								    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
 								    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
 								    # more possibilities, like variants of Chinese like 'wuu', but text that
 								    # is labeled that way is probably trying to be distinct from 'zh' and
 								    # shouldn't automatically match.
-												Format (#9630)


											
										
										
											2021-11-05 11:56:26 +03:00
+								    match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
 								    if match == "mul":
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								        # Convert 'mul' back to spaCy's 'xx'
-												Format (#9630)


											
										
										
											2021-11-05 11:56:26 +03:00
+								        return "xx"
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								    else:
 								        return match
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								def get_lang_class(lang: str) -> Type["Language"]:
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    """Import and load a Language class.
-												add lang registration facility

											
										
										
											2016-03-25 20:54:45 +03:00
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								    lang (str): IETF language code, such as 'en'.
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    RETURNS (Language): Language class.
 								    """
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    # Check if language is registered / entry point is available
 								    if lang in registry.languages:
 								        return registry.languages.get(lang)
 								    else:
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								        # Find the language in the spacy.lang subpackage
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								        try:
-												More formatting changes

											
										
										
											2019-12-25 19:59:52 +03:00
+								            module = importlib.import_module(f".lang.{lang}", "spacy")
-												Also raise original error message in util.get_lang_class

Otherwise, the true error that happens within a Language subclass is swallowed, because if it's imported lazily like that, it'll always be an ImportError

											
										
										
											2019-02-13 18:52:25 +03:00
+								        except ImportError as err:
-												Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
											
										
										
											2021-10-05 10:52:22 +03:00
+								            # Find a matching language. For example, if the language 'no' is
 								            # requested, we can use language-matching to load `spacy.lang.nb`.
 								            try:
 								                match = find_matching_language(lang)
 								            except langcodes.tag_parser.LanguageTagError:
 								                # proceed to raising an import error
 								                match = None
 								            if match:
 								                lang = match
 								                module = importlib.import_module(f".lang.{lang}", "spacy")
 								            else:
 								                raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								        set_lang_class(lang, getattr(module, module.__all__[0]))  # type: ignore[attr-defined]
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    return registry.languages.get(lang)
-												add lang registration facility

											
										
										
											2016-03-25 20:54:45 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def set_lang_class(name: str, cls: Type["Language"]) -> None:
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    """Set a custom Language class name that can be loaded via get_lang_class.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    name (str): Name of Language class.
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    cls (Language): Language class.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    registry.languages.register(name, func=cls)
-												Add load_lang_class() util function

											
										
										
											2017-05-09 00:50:45 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def ensure_path(path: Any) -> Any:
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    """Ensure string is converted to a Path.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    path (Any): Anything. If string, it's converted to Path.
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    RETURNS: Path or original argument.
 								    """
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    if isinstance(path, str):
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 13:11:16 +03:00
+								        return Path(path)
 								    else:
 								        return path
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def load_language_data(path: Union[str, Path]) -> Union[dict, list]:
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    """Load JSON language data using the given path as a base. If the provided
 								    path isn't present, will attempt to load a gzipped version before giving up.
-												Reduce size of language data (#4141)

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzip data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's
removed.

The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths

											
										
										
											2019-08-20 15:54:11 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    path (str / Path): The data to load.
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    RETURNS: The loaded data.
-												Reduce size of language data (#4141)

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzip data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's
removed.

The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths

											
										
										
											2019-08-20 15:54:11 +03:00
+								    """
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    path = ensure_path(path)
 								    if path.exists():
-												Reduce size of language data (#4141)

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzip data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's
removed.

The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths

											
										
										
											2019-08-20 15:54:11 +03:00
+								        return srsly.read_json(path)
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    path = path.with_suffix(path.suffix + ".gz")
 								    if path.exists():
 								        return srsly.read_gzip_json(path)
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    raise ValueError(Errors.E160.format(path=path))
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_module_path(module: ModuleType) -> Path:
 								    """Get the path of a Python module.
 								    module (ModuleType): The Python module.
 								    RETURNS (Path): The path.
 								    """
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    if not hasattr(module, "__module__"):
-												Improve Morphology errors (#4314)

* Improve Morphology errors

* Also clean up some other errors

* Update errors.py

											
										
										
											2019-09-21 15:37:06 +03:00
+								        raise ValueError(Errors.E169.format(module=repr(module)))
-												Fix issues for Mypy 0.950 and Pydantic 1.9.0 (#10786)

* Make changes to typing

* Correction

* Format with black

* Corrections based on review

* Bumped Thinc dependency version

* Bumped blis requirement

* Correction for older Python versions

* Update spacy/ml/models/textcat.py

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

* Corrections based on review feedback

* Readd deleted docstring line

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
											
										
										
											2022-05-25 10:33:54 +03:00
+								    file_path = Path(cast(os.PathLike, sys.modules[module.__module__].__file__))
 								    return file_path.parent
-												Reduce size of language data (#4141)

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzip data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's
removed.

The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths

											
										
										
											2019-08-20 15:54:11 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def load_model(
 								    name: Union[str, Path],
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    *,
 								    vocab: Union["Vocab", bool] = True,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								    disable: Iterable[str] = SimpleFrozenList(),
 								    exclude: Iterable[str] = SimpleFrozenList(),
-												Simplify config overrides in CLI and deserialization (#5880)


											
										
										
											2020-08-06 00:35:09 +03:00
+								    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								) -> "Language":
-												Remove symlinks, data dir and related stuff

											
										
										
											2020-02-18 19:20:17 +03:00
+								    """Load a model from a package or data path.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    name (str): Package name or model path.
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
 								        a new Vocab object will be created.
-												Re-add setting for vocab data and tidy up

											
										
										
											2020-07-25 13:14:28 +03:00
+								    disable (Iterable[str]): Names of pipeline components to disable.
-												Simplify config overrides in CLI and deserialization (#5880)


											
										
										
											2020-08-06 00:35:09 +03:00
+								    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
 								        keyed by section values in dot notation.
-												Re-add setting for vocab data and tidy up

											
										
										
											2020-07-25 13:14:28 +03:00
+								    RETURNS (Language): The loaded nlp object.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config}
-												Remove symlinks, data dir and related stuff

											
										
										
											2020-02-18 19:20:17 +03:00
+								    if isinstance(name, str):  # name or string path
-												 Add blank:{lang} shortcut to util.load_mode

											
										
										
											2020-05-21 21:24:07 +03:00
+								        if name.startswith("blank:"):  # shortcut for blank model
 								            return get_lang_class(name.replace("blank:", ""))()
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        if is_package(name):  # installed as package
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								            return load_model_from_package(name, **kwargs)  # type: ignore[arg-type]
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        if Path(name).exists():  # path to model data directory
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								            return load_model_from_path(Path(name), **kwargs)  # type: ignore[arg-type]
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    elif hasattr(name, "exists"):  # Path or Path-like to model data
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								        return load_model_from_path(name, **kwargs)  # type: ignore[arg-type]
-												Add better error for failed model shortcut loading

											
										
										
											2020-08-06 00:10:29 +03:00
+								    if name in OLD_MODEL_SHORTCUTS:
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								        raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name]))  # type: ignore[index]
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								    raise IOError(Errors.E050.format(name=name))
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def load_model_from_package(
 								    name: str,
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    *,
 								    vocab: Union["Vocab", bool] = True,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								    disable: Iterable[str] = SimpleFrozenList(),
 								    exclude: Iterable[str] = SimpleFrozenList(),
-												Simplify config overrides in CLI and deserialization (#5880)


											
										
										
											2020-08-06 00:35:09 +03:00
+								    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								) -> "Language":
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    """Load a model from an installed package.
 								    name (str): The package name.
 								    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
 								        a new Vocab object will be created.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    disable (Iterable[str]): Names of pipeline components to disable. Disabled
 								        pipes will be loaded but they won't be run unless you explicitly
 								        enable them by calling nlp.enable_pipe.
 								    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
 								        components won't be loaded.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
 								        keyed by section values in dot notation.
 								    RETURNS (Language): The loaded nlp object.
 								    """
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
+								    cls = importlib.import_module(name)
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config)  # type: ignore[attr-defined]
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def load_model_from_path(
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    model_path: Path,
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    *,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    meta: Optional[Dict[str, Any]] = None,
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    vocab: Union["Vocab", bool] = True,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								    disable: Iterable[str] = SimpleFrozenList(),
 								    exclude: Iterable[str] = SimpleFrozenList(),
-												Simplify config overrides in CLI and deserialization (#5880)


											
										
										
											2020-08-06 00:35:09 +03:00
+								    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								) -> "Language":
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
+								    """Load a model from a data directory path. Creates Language class with
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    pipeline from config.cfg and then calls from_disk() with path.
-												Fix references to config file in the docs & UX (#9961)

* doc fixes around config file

* fix typo

* clarify default
											
										
										
											2022-01-04 16:31:26 +03:00
+								    model_path (Path): Model path.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    meta (Dict[str, Any]): Optional model meta.
 								    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
 								        a new Vocab object will be created.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    disable (Iterable[str]): Names of pipeline components to disable. Disabled
 								        pipes will be loaded but they won't be run unless you explicitly
 								        enable them by calling nlp.enable_pipe.
 								    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
 								        components won't be loaded.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
 								        keyed by section values in dot notation.
 								    RETURNS (Language): The loaded nlp object.
 								    """
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    if not model_path.exists():
 								        raise IOError(Errors.E052.format(path=model_path))
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
+								    if not meta:
 								        meta = get_model_meta(model_path)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    config_path = model_path / "config.cfg"
-												Address missing config overrides post load of models (#8208)


											
										
										
											2021-05-31 11:36:52 +03:00
+								    overrides = dict_to_dot(config)
 								    config = load_config(config_path, overrides=overrides)
-												hook up meta in load_model_from_config (#10400)


											
										
										
											2022-03-04 13:07:45 +03:00
+								    nlp = load_model_from_config(
 								        config, vocab=vocab, disable=disable, exclude=exclude, meta=meta
 								    )
-												Address missing config overrides post load of models (#8208)


											
										
										
											2021-05-31 11:36:52 +03:00
+								    return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def load_model_from_config(
 								    config: Union[Dict[str, Any], Config],
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    *,
-												hook up meta in load_model_from_config (#10400)


											
										
										
											2022-03-04 13:07:45 +03:00
+								    meta: Dict[str, Any] = SimpleFrozenDict(),
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    vocab: Union["Vocab", bool] = True,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								    disable: Iterable[str] = SimpleFrozenList(),
 								    exclude: Iterable[str] = SimpleFrozenList(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    auto_fill: bool = False,
 								    validate: bool = True,
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								) -> "Language":
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    """Create an nlp object from a config. Expects the full config file including
 								    a section "nlp" containing the settings for the nlp object.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
 								    name (str): Package name or model path.
 								    meta (Dict[str, Any]): Optional model meta.
 								    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
 								        a new Vocab object will be created.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    disable (Iterable[str]): Names of pipeline components to disable. Disabled
 								        pipes will be loaded but they won't be run unless you explicitly
 								        enable them by calling nlp.enable_pipe.
 								    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
 								        components won't be loaded.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    auto_fill (bool): Whether to auto-fill config with missing defaults.
 								    validate (bool): Whether to show config validation errors.
 								    RETURNS (Language): The loaded nlp object.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    """
 								    if "nlp" not in config:
 								        raise ValueError(Errors.E985.format(config=config))
 								    nlp_config = config["nlp"]
-												util function dot_to_object and corresponding unit test

											
										
										
											2020-07-27 18:50:12 +03:00
+								    if "lang" not in nlp_config or nlp_config["lang"] is None:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        raise ValueError(Errors.E993.format(config=nlp_config))
 								    # This will automatically handle all codes registered via the languages
 								    # registry, including custom subclasses provided via entry points
 								    lang_cls = get_lang_class(nlp_config["lang"])
 								    nlp = lang_cls.from_config(
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        config,
 								        vocab=vocab,
 								        disable=disable,
 								        exclude=exclude,
 								        auto_fill=auto_fill,
 								        validate=validate,
-												hook up meta in load_model_from_config (#10400)


											
										
										
											2022-03-04 13:07:45 +03:00
+								        meta=meta,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    )
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								    return nlp
-												Move replacement logic to Language.from_config

											
										
										
											2021-01-29 11:37:04 +03:00
+								def get_sourced_components(
 								    config: Union[Dict[str, Any], Config]
 								) -> Dict[str, Dict[str, Any]]:
 								    """RETURNS (List[str]): All sourced components in the original config,
 								    e.g. {"source": "en_core_web_sm"}. If the config contains a key
 								    "factory", we assume it refers to a component factory.
 								    """
 								    return {
 								        name: cfg
 								        for name, cfg in config.get("components", {}).items()
 								        if "factory" not in cfg and "source" in cfg
 								    }
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								def resolve_dot_names(
 								    config: Config, dot_names: List[Optional[str]]
 								) -> Tuple[Any, ...]:
-												Update vocab init

											
										
										
											2020-09-28 12:30:18 +03:00
+								    """Resolve one or more "dot notation" names, e.g. corpora.train.
-												Add (untested) resolve_dot_names util

											
										
										
											2020-09-28 04:06:12 +03:00
+								    The paths could point anywhere into the config, so we don't know which
 								    top-level section we'll be looking within.
-												Update vocab init

											
										
										
											2020-09-28 12:30:18 +03:00
-												Add (untested) resolve_dot_names util

											
										
										
											2020-09-28 04:06:12 +03:00
+								    We resolve the whole top-level section, although we could resolve less --
 								    we could find the lowest part of the tree.
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								    """
-												Refactor CLI

											
										
										
											2020-09-28 16:09:59 +03:00
+								    # TODO: include schema?
-												Add (untested) resolve_dot_names util

											
										
										
											2020-09-28 04:06:12 +03:00
+								    resolved = {}
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    output: List[Any] = []
-												Refactor CLI

											
										
										
											2020-09-28 16:09:59 +03:00
+								    errors = []
-												Add (untested) resolve_dot_names util

											
										
										
											2020-09-28 04:06:12 +03:00
+								    for name in dot_names:
 								        if name is None:
 								            output.append(name)
 								        else:
 								            section = name.split(".")[0]
-												Refactor CLI

											
										
										
											2020-09-28 16:09:59 +03:00
+								            # We want to avoid resolving the same thing twice
-												Add (untested) resolve_dot_names util

											
										
										
											2020-09-28 04:06:12 +03:00
+								            if section not in resolved:
-												Fix small issues, resolve_dot_names and debug model

											
										
										
											2020-09-29 21:38:35 +03:00
+								                if registry.is_promise(config[section]):
 								                    # Otherwise we can't resolve [corpus] if it's a promise
 								                    result = registry.resolve({"config": config[section]})["config"]
 								                else:
 								                    result = registry.resolve(config[section])
 								                resolved[section] = result
-												Refactor CLI

											
										
										
											2020-09-28 16:09:59 +03:00
+								            try:
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								                output.append(dot_to_object(resolved, name))  # type: ignore[arg-type]
-												Refactor CLI

											
										
										
											2020-09-28 16:09:59 +03:00
+								            except KeyError:
 								                msg = f"not a valid section reference: {name}"
 								                errors.append({"loc": name.split("."), "msg": msg})
 								    if errors:
 								        raise ConfigValidationError(config=config, errors=errors)
-												Fix small issues, resolve_dot_names and debug model

											
										
										
											2020-09-29 21:38:35 +03:00
+								    return tuple(output)
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 20:42:27 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def load_model_from_init_py(
 								    init_file: Union[Path, str],
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    *,
 								    vocab: Union["Vocab", bool] = True,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								    disable: Iterable[str] = SimpleFrozenList(),
 								    exclude: Iterable[str] = SimpleFrozenList(),
-												Simplify config overrides in CLI and deserialization (#5880)


											
										
										
											2020-08-06 00:35:09 +03:00
+								    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								) -> "Language":
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
+								    """Helper function to use in the `load()` method of a model package's
 								    __init__.py.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
 								    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
 								        a new Vocab object will be created.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    disable (Iterable[str]): Names of pipeline components to disable. Disabled
 								        pipes will be loaded but they won't be run unless you explicitly
 								        enable them by calling nlp.enable_pipe.
 								    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
 								        components won't be loaded.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
 								        keyed by section values in dot notation.
 								    RETURNS (Language): The loaded nlp object.
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
+								    """
 								    model_path = Path(init_file).parent
-												Fix and document model loading with pipeline and overrides

											
										
										
											2017-05-29 15:10:10 +03:00
+								    meta = get_model_meta(model_path)
-												More formatting changes

											
										
										
											2019-12-25 19:59:52 +03:00
+								    data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
-												Fix and document model loading with pipeline and overrides

											
										
										
											2017-05-29 15:10:10 +03:00
+								    data_path = model_path / data_dir
 								    if not model_path.exists():
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								        raise IOError(Errors.E052.format(path=data_path))
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    return load_model_from_path(
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        data_path,
 								        vocab=vocab,
 								        meta=meta,
 								        disable=disable,
 								        exclude=exclude,
 								        config=config,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    )
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								def load_config(
 								    path: Union[str, Path],
 								    overrides: Dict[str, Any] = SimpleFrozenDict(),
 								    interpolate: bool = False,
 								) -> Config:
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    """Load a config file. Takes care of path validation and section order.
-												Allow reading config from sdtin in spacy train

											
										
										
											2020-12-08 10:01:40 +03:00
+								    path (Union[str, Path]): Path to the config file or "-" to read from stdin.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    overrides: (Dict[str, Any]): Config overrides as nested dict or
 								        dict keyed by section values in dot notation.
 								    interpolate (bool): Whether to interpolate and resolve variables.
 								    RETURNS (Config): The loaded config.
 								    """
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								    config_path = ensure_path(path)
-												Allow reading config from sdtin in spacy train

											
										
										
											2020-12-08 10:01:40 +03:00
+								    config = Config(section_order=CONFIG_SECTION_ORDER)
 								    if str(config_path) == "-":  # read from standard input
 								        return config.from_str(
 								            sys.stdin.read(), overrides=overrides, interpolate=interpolate
 								        )
 								    else:
-												Fix references to config file in the docs & UX (#9961)

* doc fixes around config file

* fix typo

* clarify default
											
										
										
											2022-01-04 16:31:26 +03:00
+								        if not config_path or not config_path.is_file():
 								            raise IOError(Errors.E053.format(path=config_path, name="config file"))
-												Allow reading config from sdtin in spacy train

											
										
										
											2020-12-08 10:01:40 +03:00
+								        return config.from_disk(
 								            config_path, overrides=overrides, interpolate=interpolate
 								        )
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
 								def load_config_from_str(
 								    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
 								):
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    """Load a full config from a string. Wrapper around Thinc's Config.from_str.
 								    text (str): The string config to load.
 								    interpolate (bool): Whether to interpolate and resolve variables.
 								    RETURNS (Config): The loaded config.
 								    """
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								    return Config(section_order=CONFIG_SECTION_ORDER).from_str(
-												Tidy up and auto-format

											
										
										
											2020-09-29 22:39:28 +03:00
+								        text, overrides=overrides, interpolate=interpolate
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								    )
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_installed_models() -> List[str]:
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								    """List all model packages currently installed in the environment.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    RETURNS (List[str]): The string names of the models.
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								    """
 								    return list(registry.models.get_all().keys())
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_package_version(name: str) -> Optional[str]:
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								    """Get the version of an installed package. Typically used to get model
 								    package versions.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    name (str): The name of the installed Python package.
 								    RETURNS (str / None): The version or None if package not installed.
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								    """
 								    try:
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								        return importlib_metadata.version(name)  # type: ignore[attr-defined]
 								    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								        return None
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def is_compatible_version(
 								    version: str, constraint: str, prereleases: bool = True
 								) -> Optional[bool]:
-												Make functions more general purpose and update docstrings and tests

											
										
										
											2020-05-30 16:18:53 +03:00
+								    """Check if a version (e.g. "2.0.0") is compatible given a version
 								    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
 								    it's interpreted as =={version}.
 								    version (str): The version to check.
 								    constraint (str): The constraint string.
 								    prereleases (bool): Whether to allow prereleases. If set to False,
 								        prerelease versions will be considered incompatible.
 								    RETURNS (bool / None): Whether the version is compatible, or None if the
 								        version or constraint are invalid.
 								    """
 								    # Handle cases where exact version is provided as constraint
-												Use more sophisticated version parsing logic

											
										
										
											2020-05-30 16:01:58 +03:00
+								    if constraint[0].isdigit():
 								        constraint = f"=={constraint}"
 								    try:
 								        spec = SpecifierSet(constraint)
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								        version = Version(version)  # type: ignore[assignment]
-												Make functions more general purpose and update docstrings and tests

											
										
										
											2020-05-30 16:18:53 +03:00
+								    except (InvalidSpecifier, InvalidVersion):
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								        return None
-												Make functions more general purpose and update docstrings and tests

											
										
										
											2020-05-30 16:18:53 +03:00
+								    spec.prereleases = prereleases
-												Use more sophisticated version parsing logic

											
										
										
											2020-05-30 16:01:58 +03:00
+								    return version in spec
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def is_unconstrained_version(
 								    constraint: str, prereleases: bool = True
 								) -> Optional[bool]:
-												Add warning for loose version constraints (#5536)

* Add warning for loose version constraints

* Update wording [ci skip]

* Tweak error message

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-06-05 13:42:15 +03:00
+								    # We have an exact version, this is the ultimate constrained version
 								    if constraint[0].isdigit():
 								        return False
 								    try:
 								        spec = SpecifierSet(constraint)
 								    except InvalidSpecifier:
 								        return None
 								    spec.prereleases = prereleases
 								    specs = [sp for sp in spec]
 								    # We only have one version spec and it defines > or >=
 								    if len(specs) == 1 and specs[0].operator in (">", ">="):
 								        return True
 								    # One specifier is exact version
 								    if any(sp.operator in ("==") for sp in specs):
 								        return False
 								    has_upper = any(sp.operator in ("<", "<=") for sp in specs)
 								    has_lower = any(sp.operator in (">", ">=") for sp in specs)
 								    # We have a version spec that defines an upper and lower bound
 								    if has_upper and has_lower:
 								        return False
 								    # Everything else, like only an upper version, only a lower version etc.
 								    return True
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
+								def split_requirement(requirement: str) -> Tuple[str, str]:
 								    """Split a requirement like spacy>=1.2.3 into ("spacy", ">=1.2.3")."""
 								    req = Requirement(requirement)
 								    return (req.name, str(req.specifier))
 								def get_minor_version_range(version: str) -> str:
 								    """Generate a version range like >=1.2.3,<1.3.0 based on a given version
 								    (e.g. of spaCy).
-												WIP: improve model version deps

											
										
										
											2020-05-28 13:51:37 +03:00
+								    """
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
+								    release = Version(version).release
 								    return f">={version},<{release[0]}.{release[1] + 1}.0"
-												Use more sophisticated version parsing logic

											
										
										
											2020-05-30 16:01:58 +03:00
-												Use minor version for compatibility check (#8403)

* Use minor version for compatibility check

* Use minor version of compatibility table
* Soften warning message about incompatible models
* Add test for presence of current version in compatibility table

* Add test for download compatibility table

* Use minor version of lower pin in error message if possible

* Fall back to spacy_git_version if available

* Fix unknown version string
											
										
										
											2021-06-21 10:39:22 +03:00
+								def get_model_lower_version(constraint: str) -> Optional[str]:
-												Tidy up code

											
										
										
											2021-06-28 12:48:00 +03:00
+								    """From a version range like >=1.2.3,<1.3.0 return the lower pin."""
-												Use minor version for compatibility check (#8403)

* Use minor version for compatibility check

* Use minor version of compatibility table
* Soften warning message about incompatible models
* Add test for presence of current version in compatibility table

* Add test for download compatibility table

* Use minor version of lower pin in error message if possible

* Fall back to spacy_git_version if available

* Fix unknown version string
											
										
										
											2021-06-21 10:39:22 +03:00
+								    try:
 								        specset = SpecifierSet(constraint)
 								        for spec in specset:
 								            if spec.operator in (">=", "==", "~="):
 								                return spec.version
 								    except Exception:
 								        pass
 								    return None
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_base_version(version: str) -> str:
-												Make functions more general purpose and update docstrings and tests

											
										
										
											2020-05-30 16:18:53 +03:00
+								    """Generate the base version without any prerelease identifiers.
 								    version (str): The version, e.g. "3.0.0.dev1".
 								    RETURNS (str): The base version, e.g. "3.0.0".
 								    """
-												Use more sophisticated version parsing logic

											
										
										
											2020-05-30 16:01:58 +03:00
+								    return Version(version).base_version
-												WIP: improve model version deps

											
										
										
											2020-05-28 13:51:37 +03:00
-												Include spaCy version check in project CLI

											
										
										
											2020-10-05 14:53:07 +03:00
+								def get_minor_version(version: str) -> Optional[str]:
 								    """Get the major + minor version (without patch or prerelease identifiers).
 								    version (str): The version.
 								    RETURNS (str): The major + minor version or None if version is invalid.
 								    """
 								    try:
 								        v = Version(version)
 								    except (TypeError, InvalidVersion):
 								        return None
 								    return f"{v.major}.{v.minor}"
 								def is_minor_version_match(version_a: str, version_b: str) -> bool:
 								    """Compare two versions and check if they match in major and minor, without
 								    patch or prerelease identifiers. Used internally for compatibility checks
 								    that should be insensitive to patch releases.
 								    version_a (str): The first version
 								    version_b (str): The second version.
 								    RETURNS (bool): Whether the versions match.
 								    """
 								    a = get_minor_version(version_a)
 								    b = get_minor_version(version_b)
 								    return a is not None and b is not None and a == b
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
 								    """Load a model meta.json from a path and validate its contents.
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    path (Union[str, Path]): Path to meta.json.
 								    RETURNS (Dict[str, Any]): The loaded meta.
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
+								    """
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    path = ensure_path(path)
 								    if not path.parent.exists():
 								        raise IOError(Errors.E052.format(path=path.parent))
 								    if not path.exists() or not path.is_file():
-												fix get_dim calls in build_simple_cnn_text_classifier

											
										
										
											2020-10-09 16:40:58 +03:00
+								        raise IOError(Errors.E053.format(path=path.parent, name="meta.json"))
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    meta = srsly.read_json(path)
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    for setting in ["lang", "name", "version"]:
-												Add more validation for model meta

											
										
										
											2017-08-29 12:21:44 +03:00
+								        if setting not in meta or not meta[setting]:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise ValueError(Errors.E054.format(setting=setting))
-												Add rudimentary version checks on model load

											
										
										
											2020-06-02 18:23:16 +03:00
+								    if "spacy_version" in meta:
-												Add warning

											
										
										
											2020-05-30 16:34:54 +03:00
+								        if not is_compatible_version(about.__version__, meta["spacy_version"]):
-												Use minor version for compatibility check (#8403)

* Use minor version for compatibility check

* Use minor version of compatibility table
* Soften warning message about incompatible models
* Add test for presence of current version in compatibility table

* Add test for download compatibility table

* Use minor version of lower pin in error message if possible

* Fall back to spacy_git_version if available

* Fix unknown version string
											
										
										
											2021-06-21 10:39:22 +03:00
+								            lower_version = get_model_lower_version(meta["spacy_version"])
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								            lower_version = get_minor_version(lower_version)  # type: ignore[arg-type]
-												Use minor version for compatibility check (#8403)

* Use minor version for compatibility check

* Use minor version of compatibility table
* Soften warning message about incompatible models
* Add test for presence of current version in compatibility table

* Add test for download compatibility table

* Use minor version of lower pin in error message if possible

* Fall back to spacy_git_version if available

* Fix unknown version string
											
										
										
											2021-06-21 10:39:22 +03:00
+								            if lower_version is not None:
 								                lower_version = "v" + lower_version
 								            elif "spacy_git_version" in meta:
 								                lower_version = "git commit " + meta["spacy_git_version"]
 								            else:
 								                lower_version = "version unknown"
-												Add warning for loose version constraints (#5536)

* Add warning for loose version constraints

* Update wording [ci skip]

* Tweak error message

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-06-05 13:42:15 +03:00
+								            warn_msg = Warnings.W095.format(
 								                model=f"{meta['lang']}_{meta['name']}",
-												Add rudimentary version checks on model load

											
										
										
											2020-06-02 18:23:16 +03:00
+								                model_version=meta["version"],
-												Use minor version for compatibility check (#8403)

* Use minor version for compatibility check

* Use minor version of compatibility table
* Soften warning message about incompatible models
* Add test for presence of current version in compatibility table

* Add test for download compatibility table

* Use minor version of lower pin in error message if possible

* Fall back to spacy_git_version if available

* Fix unknown version string
											
										
										
											2021-06-21 10:39:22 +03:00
+								                version=lower_version,
-												Add rudimentary version checks on model load

											
										
										
											2020-06-02 18:23:16 +03:00
+								                current=about.__version__,
 								            )
 								            warnings.warn(warn_msg)
-												Add warning for loose version constraints (#5536)

* Add warning for loose version constraints

* Update wording [ci skip]

* Tweak error message

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-06-05 13:42:15 +03:00
+								        if is_unconstrained_version(meta["spacy_version"]):
 								            warn_msg = Warnings.W094.format(
 								                model=f"{meta['lang']}_{meta['name']}",
 								                model_version=meta["version"],
 								                version=meta["spacy_version"],
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
+								                example=get_minor_version_range(about.__version__),
-												Add warning

											
										
										
											2020-05-30 16:34:54 +03:00
+								            )
-												Add warning for loose version constraints (#5536)

* Add warning for loose version constraints

* Update wording [ci skip]

* Tweak error message

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-06-05 13:42:15 +03:00
+								            warnings.warn(warn_msg)
-												Fix and document model loading with pipeline and overrides

											
										
										
											2017-05-29 15:10:10 +03:00
+								    return meta
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
 								    """Get model meta.json from a directory path and validate its contents.
 								    path (str / Path): Path to model directory.
 								    RETURNS (Dict[str, Any]): The model's meta data.
 								    """
 								    model_path = ensure_path(path)
 								    return load_meta(model_path / "meta.json")
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def is_package(name: str) -> bool:
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Check if string maps to a package installed via pip.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    name (str): Name of package.
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    RETURNS (bool): True if installed package, False if not.
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
+								    """
-												Simplify is_package check

											
										
										
											2020-05-24 15:48:56 +03:00
+								    try:
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								        importlib_metadata.distribution(name)  # type: ignore[attr-defined]
-												Simplify is_package check

											
										
										
											2020-05-24 15:48:56 +03:00
+								        return True
 								    except:  # noqa: E722
 								        return False
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_package_path(name: str) -> Path:
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
+								    """Get the path to an installed package.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    name (str): Package name.
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
+								    RETURNS (Path): Path to installed package.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
+								    # Here we're importing the module just to find it. This is worryingly
 								    # indirect, but it's otherwise very difficult to find the package.
-												Add tests for displaCy and util functions and fix util typo

											
										
										
											2017-05-29 11:51:19 +03:00
+								    pkg = importlib.import_module(name)
-												Fix issues for Mypy 0.950 and Pydantic 1.9.0 (#10786)

* Make changes to typing

* Correction

* Format with black

* Corrections based on review

* Bumped Thinc dependency version

* Bumped blis requirement

* Correction for older Python versions

* Update spacy/ml/models/textcat.py

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

* Corrections based on review feedback

* Readd deleted docstring line

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
											
										
										
											2022-05-25 10:33:54 +03:00
+								    return Path(cast(Union[str, os.PathLike], pkg.__file__)).parent
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
-												Add option to replace listeners for sourced components

											
										
										
											2021-01-29 07:57:04 +03:00
+								def replace_model_node(model: Model, target: Model, replacement: Model) -> None:
 								    """Replace a node within a model with a new one, updating refs.
 								    model (Model): The parent model.
 								    target (Model): The target node.
 								    replacement (Model): The node to replace the target with.
 								    """
 								    # Place the node into the sublayers
 								    for node in model.walk():
 								        if target in node.layers:
 								            node.layers[node.layers.index(target)] = replacement
 								    # Now fix any node references
 								    for node in model.walk():
 								        for ref_name in node.ref_names:
 								            if node.maybe_get_ref(ref_name) is target:
 								                node.set_ref(ref_name, replacement)
-												Make run_command take string and list

											
										
										
											2020-06-30 14:17:00 +03:00
+								def split_command(command: str) -> List[str]:
 								    """Split a string command using shlex. Handles platform compatibility.
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
-												Make run_command take string and list

											
										
										
											2020-06-30 14:17:00 +03:00
+								    command (str) : The command to split
 								    RETURNS (List[str]): The split command.
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								    """
-												Make run_command take string and list

											
										
										
											2020-06-30 14:17:00 +03:00
+								    return shlex.split(command, posix=not is_windows)
-												Refactor project CLI (#5732)

* Make project command a submodule

* Update with WIP

* Add helper for joining commands

* Update docstrins, formatting and types

* Update assets and add support for copying local files

* Fix type

* Update success messages
											
										
										
											2020-07-09 02:42:51 +03:00
+								def join_command(command: List[str]) -> str:
 								    """Join a command using shlex. shlex.join is only available for Python 3.8+,
 								    so we're using a workaround here.
 								    command (List[str]): The command to join.
 								    RETURNS (str): The joined command
 								    """
 								    return " ".join(shlex.quote(cmd) for cmd in command)
-												Update types and docstrings

											
										
										
											2020-09-13 11:52:02 +03:00
+								def run_command(
 								    command: Union[str, List[str]],
 								    *,
 								    stdin: Optional[Any] = None,
-												Format

											
										
										
											2020-09-20 17:31:48 +03:00
+								    capture: bool = False,
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								) -> subprocess.CompletedProcess:
-												Make run_command take string and list

											
										
										
											2020-06-30 14:17:00 +03:00
+								    """Run a command on the command line as a subprocess. If the subprocess
 								    returns a non-zero exit code, a system exit is performed.
 								    command (str / List[str]): The command. If provided as a string, the
 								        string will be split using shlex.split.
-												Update types and docstrings

											
										
										
											2020-09-13 11:52:02 +03:00
+								    stdin (Optional[Any]): stdin to read from or None.
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								    capture (bool): Whether to capture the output and errors. If False,
 								        the stdout and stderr will not be redirected, and if there's an error,
-												Add capture argument to project_run (#6878)

* add capture argument to project_run and run_commands

* git bump to 3.0.1

* Set version to 3.0.1.dev0

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2021-02-02 05:11:15 +03:00
+								        sys.exit will be called with the return code. You should use capture=False
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								        when you want to turn over execution to the command, and capture=True
 								        when you want to run the command more like a function.
-												Update types and docstrings

											
										
										
											2020-09-13 11:52:02 +03:00
+								    RETURNS (Optional[CompletedProcess]): The process object.
-												Make run_command take string and list

											
										
										
											2020-06-30 14:17:00 +03:00
+								    """
 								    if isinstance(command, str):
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								        cmd_list = split_command(command)
 								        cmd_str = command
 								    else:
 								        cmd_list = command
 								        cmd_str = " ".join(command)
-												add custom warning when run_command fails

											
										
										
											2020-06-30 18:28:43 +03:00
+								    try:
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								        ret = subprocess.run(
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								            cmd_list,
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								            env=os.environ.copy(),
 								            input=stdin,
-												Fix run_command for python 3.6

											
										
										
											2020-08-26 06:02:43 +03:00
+								            encoding="utf8",
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								            check=False,
-												Make run_command backwards compatible

											
										
										
											2020-08-26 05:33:42 +03:00
+								            stdout=subprocess.PIPE if capture else None,
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								            stderr=subprocess.STDOUT if capture else None,
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								        )
-												add custom warning when run_command fails

											
										
										
											2020-06-30 18:28:43 +03:00
+								    except FileNotFoundError:
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								        # Indicates the *command* wasn't found, it's an error before the command
 								        # is run.
-												add custom warning when run_command fails

											
										
										
											2020-06-30 18:28:43 +03:00
+								        raise FileNotFoundError(
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								            Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								        ) from None
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								    if ret.returncode != 0 and capture:
 								        message = f"Error running command:\n\n{cmd_str}\n\n"
 								        message += f"Subprocess exited with status {ret.returncode}"
 								        if ret.stdout is not None:
 								            message += f"\n\nProcess log (stdout and stderr):\n\n"
 								            message += ret.stdout
 								        error = subprocess.SubprocessError(message)
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								        error.ret = ret  # type: ignore[attr-defined]
 								        error.command = cmd_str  # type: ignore[attr-defined]
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								        raise error
 								    elif ret.returncode != 0:
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								        sys.exit(ret.returncode)
 								    return ret
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
 								@contextmanager
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								def working_dir(path: Union[str, Path]) -> Iterator[Path]:
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								    """Change current working directory and returns to previous on exit.
 								    path (str / Path): The directory to navigate to.
-												Make working_dir yield absolute cwd path

											
										
										
											2020-06-30 14:17:14 +03:00
+								    YIELDS (Path): The absolute path to the current working directory. This
 								        should be used if the block needs to perform actions within the working
 								        directory, to prevent mismatches with relative paths.
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								    """
 								    prev_cwd = Path.cwd()
-												Resolve within working_dir context manager

											
										
										
											2020-06-30 14:29:45 +03:00
+								    current = Path(path).resolve()
 								    os.chdir(str(current))
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								    try:
-												Resolve within working_dir context manager

											
										
										
											2020-06-30 14:29:45 +03:00
+								        yield current
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								    finally:
-												Resolve within working_dir context manager

											
										
										
											2020-06-30 14:29:45 +03:00
+								        os.chdir(str(prev_cwd))
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
-												Update project CLI

											
										
										
											2020-06-22 15:53:31 +03:00
+								@contextmanager
-												Allow spacy project to push and pull to/from remote storage (#5949)

* Add utils for working with remote storage

* WIP add remote_cache for project

* WIP add push and pull commands

* Use pathy in remote_cache

* Updarte util

* Update remote_cache

* Update util

* Update project assets

* Update pull script

* Update push script

* Fix type annotation in util

* Work on remote storage

* Remove site and env hash

* Fix imports

* Fix type annotation

* Require pathy

* Require pathy

* Fix import

* Add a util to handle project variable substitution

* Import push and pull commands

* Fix pull command

* Fix push command

* Fix tarfile in remote_storage

* Improve printing

* Fiddle with status messages

* Set version to v3.0.0a9

* Draft docs for spacy project remote storages

* Update docs [ci skip]

* Use Thinc config to simplify and unify template variables

* Auto-format

* Don't import Pathy globally for now

Causes slow and annoying Google Cloud warning

* Tidy up test

* Tidy up and update tests

* Update to latest Thinc

* Update docs

* variables -> vars

* Update docs [ci skip]

* Update docs [ci skip]

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-23 19:32:09 +03:00
+								def make_tempdir() -> Generator[Path, None, None]:
-												Tidy up, document and add custom clone logic

											
										
										
											2020-06-28 16:08:35 +03:00
+								    """Execute a block in a temporary directory and remove the directory and
 								    its contents at the end of the with block.
 								    YIELDS (Path): The path of the temp directory.
 								    """
-												Update project CLI

											
										
										
											2020-06-22 15:53:31 +03:00
+								    d = Path(tempfile.mkdtemp())
 								    yield d
-												throw warning (instead of crashing) when temp dir can't be cleaned

											
										
										
											2020-06-29 19:16:39 +03:00
+								    try:
 								        shutil.rmtree(str(d))
-												print details on error msg (e.g. PermissionError on specific file)

											
										
										
											2020-06-29 19:22:33 +03:00
+								    except PermissionError as e:
 								        warnings.warn(Warnings.W091.format(dir=d, msg=e))
-												Update project CLI

											
										
										
											2020-06-22 15:53:31 +03:00
-												Refactor project CLI (#5732)

* Make project command a submodule

* Update with WIP

* Add helper for joining commands

* Update docstrins, formatting and types

* Update assets and add support for copying local files

* Fix type

* Update success messages
											
										
										
											2020-07-09 02:42:51 +03:00
+								def is_cwd(path: Union[Path, str]) -> bool:
 								    """Check whether a path is the current working directory.
 								    path (Union[Path, str]): The directory path.
 								    RETURNS (bool): Whether the path is the current working directory.
 								    """
 								    return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def is_in_jupyter() -> bool:
-												Update spacy.util documentation

											
										
										
											2017-05-21 02:12:09 +03:00
+								    """Check if user is running spaCy from a Jupyter notebook by detecting the
 								    IPython kernel. Mainly used for the displaCy visualizer.
-												Add is_in_jupyter() helper for displaCy (see #1058)

											
										
										
											2017-05-18 15:13:14 +03:00
+								    RETURNS (bool): True if in Jupyter, False if not.
 								    """
-												Small fixes to displaCy (#3076)

## Description
- [x] fix auto-detection of Jupyter notebooks (even if `jupyter=True` isn't set)
- [x] add `displacy.set_render_wrapper` method to define a custom function called around the HTML markup generated in all calls to `displacy.render` (can be used to allow custom integrations, callbacks and page formatting)
- [x] add option to customise host for web server
- [x] show warning if `displacy.serve` is called from within Jupyter notebooks
- [x] move error message to `spacy.errors.Errors`.

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-12-20 19:32:04 +03:00
+								    # https://stackoverflow.com/a/39662359/6400719
-												Add is_in_jupyter() helper for displaCy (see #1058)

											
										
										
											2017-05-18 15:13:14 +03:00
+								    try:
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								        shell = get_ipython().__class__.__name__  # type: ignore[name-defined]
-												Small fixes to displaCy (#3076)

## Description
- [x] fix auto-detection of Jupyter notebooks (even if `jupyter=True` isn't set)
- [x] add `displacy.set_render_wrapper` method to define a custom function called around the HTML markup generated in all calls to `displacy.render` (can be used to allow custom integrations, callbacks and page formatting)
- [x] add option to customise host for web server
- [x] show warning if `displacy.serve` is called from within Jupyter notebooks
- [x] move error message to `spacy.errors.Errors`.

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-12-20 19:32:04 +03:00
+								        if shell == "ZMQInteractiveShell":
 								            return True  # Jupyter notebook or qtconsole
-												Add is_in_jupyter() helper for displaCy (see #1058)

											
										
										
											2017-05-18 15:13:14 +03:00
+								    except NameError:
-												Small fixes to displaCy (#3076)

## Description
- [x] fix auto-detection of Jupyter notebooks (even if `jupyter=True` isn't set)
- [x] add `displacy.set_render_wrapper` method to define a custom function called around the HTML markup generated in all calls to `displacy.render` (can be used to allow custom integrations, callbacks and page formatting)
- [x] add option to customise host for web server
- [x] show warning if `displacy.serve` is called from within Jupyter notebooks
- [x] move error message to `spacy.errors.Errors`.

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-12-20 19:32:04 +03:00
+								        return False  # Probably standard Python interpreter
-												Add is_in_jupyter() helper for displaCy (see #1058)

											
										
										
											2017-05-18 15:13:14 +03:00
+								    return False
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def get_object_name(obj: Any) -> str:
 								    """Get a human-readable name of a Python object, e.g. a pipeline component.
 								    obj (Any): The Python object, typically a function or class.
 								    RETURNS (str): A human-readable name.
 								    """
-												Revert added_strings change (#6236)


											
										
										
											2020-10-10 19:55:07 +03:00
+								    if hasattr(obj, "name") and obj.name is not None:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        return obj.name
 								    if hasattr(obj, "__name__"):
 								        return obj.__name__
 								    if hasattr(obj, "__class__") and hasattr(obj.__class__, "__name__"):
 								        return obj.__class__.__name__
 								    return repr(obj)
-												Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to go for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors

											
										
										
											2019-10-27 15:35:49 +03:00
-												Allow component decorators to re-run with same function

											
										
										
											2020-08-28 17:27:22 +03:00
+								def is_same_func(func1: Callable, func2: Callable) -> bool:
 								    """Approximately decide whether two functions are the same, even if their
 								    identity is different (e.g. after they have been live reloaded). Mostly
 								    used in the @Language.component and @Language.factory decorators to decide
 								    whether to raise if a factory already exists. Allows decorator to run
 								    multiple times with the same function.
 								    func1 (Callable): The first function.
 								    func2 (Callable): The second function.
 								    RETURNS (bool): Whether it's the same function (most likely).
 								    """
 								    if not callable(func1) or not callable(func2):
 								        return False
-												Update and add test

											
										
										
											2021-02-10 06:12:00 +03:00
+								    if not hasattr(func1, "__qualname__") or not hasattr(func2, "__qualname__"):
-												Several callable objects do not have __qualname__
											
										
										
											2021-02-09 08:43:02 +03:00
+								        return False
-												Allow component decorators to re-run with same function

											
										
										
											2020-08-28 17:27:22 +03:00
+								    same_name = func1.__qualname__ == func2.__qualname__
 								    same_file = inspect.getfile(func1) == inspect.getfile(func2)
 								    same_code = inspect.getsourcelines(func1) == inspect.getsourcelines(func2)
-												Update spacy/util.py

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-29 14:17:06 +03:00
+								    return same_name and same_file and same_code
-												Allow component decorators to re-run with same function

											
										
										
											2020-08-28 17:27:22 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_cuda_stream(
 								    require: bool = False, non_blocking: bool = True
 								) -> Optional[CudaStream]:
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								    ops = get_current_ops()
-												Revert "Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop""

This reverts commit f41e626844c62c1cf1333a99df2e027ec205a7c6.

											
										
										
											2018-03-27 20:22:52 +03:00
+								    if CudaStream is None:
 								        return None
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								    elif isinstance(ops, NumpyOps):
-												Revert "Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop""

This reverts commit f41e626844c62c1cf1333a99df2e027ec205a7c6.

											
										
										
											2018-03-27 20:22:52 +03:00
+								        return None
 								    else:
-												Support no hidden layer in parser and NER (#4672)

* Support no hidden layers for parser

* Fix parser model for depth 1

* Fix parser for hidden depth=0

* Add option of non-blocking to CUDA stream

											
										
										
											2019-11-19 17:54:34 +03:00
+								        return CudaStream(non_blocking=non_blocking)
-												Improve integration of NN parser, to support unified training API

											
										
										
											2017-05-15 22:46:08 +03:00
 								def get_async(stream, numpy_array):
 								    if cupy is None:
 								        return numpy_array
 								    else:
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        array = cupy.ndarray(numpy_array.shape, order="C", dtype=numpy_array.dtype)
-												Add util.env_opt support: Can set hyper params through environment variables.

											
										
										
											2017-05-18 12:36:53 +03:00
+								        array.set(numpy_array, stream=stream)
 								        return array
-												Fix formatting

											
										
										
											2017-05-26 13:37:45 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def read_regex(path: Union[str, Path]) -> Pattern:
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 13:11:16 +03:00
+								    path = ensure_path(path)
-												set encodings explicitly to utf8 (#4551)


											
										
										
											2019-10-29 15:16:55 +03:00
+								    with path.open(encoding="utf8") as file_:
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        entries = file_.read().split("\n")
 								    expression = "|".join(
 								        ["^" + re.escape(piece) for piece in entries if piece.strip()]
 								    )
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
+								    return re.compile(expression)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def compile_prefix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
-												Update docstrings [ci skip]

											
										
										
											2019-02-24 20:39:59 +03:00
+								    """Compile a sequence of prefix rules into a regex object.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    entries (Iterable[Union[str, Pattern]]): The prefix rules, e.g.
 								        spacy.lang.punctuation.TOKENIZER_PREFIXES.
 								    RETURNS (Pattern): The regex object. to be used for Tokenizer.prefix_search.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
+								    """
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    expression = "|".join(["^" + piece for piece in entries if piece.strip()])  # type: ignore[operator, union-attr]
-												Remove dead and/or deprecated code (#5710)

* Remove dead and/or deprecated code

* Remove n_threads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-06 14:06:25 +03:00
+								    return re.compile(expression)
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def compile_suffix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
-												Update docstrings [ci skip]

											
										
										
											2019-02-24 20:39:59 +03:00
+								    """Compile a sequence of suffix rules into a regex object.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    entries (Iterable[Union[str, Pattern]]): The suffix rules, e.g.
 								        spacy.lang.punctuation.TOKENIZER_SUFFIXES.
 								    RETURNS (Pattern): The regex object. to be used for Tokenizer.suffix_search.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
+								    """
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    expression = "|".join([piece + "$" for piece in entries if piece.strip()])  # type: ignore[operator, union-attr]
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
+								    return re.compile(expression)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def compile_infix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
-												Update docstrings [ci skip]

											
										
										
											2019-02-24 20:39:59 +03:00
+								    """Compile a sequence of infix rules into a regex object.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    entries (Iterable[Union[str, Pattern]]): The infix rules, e.g.
 								        spacy.lang.punctuation.TOKENIZER_INFIXES.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
+								    RETURNS (regex object): The regex object. to be used for Tokenizer.infix_finditer.
 								    """
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    expression = "|".join([piece for piece in entries if piece.strip()])  # type: ignore[misc, union-attr]
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
+								    return re.compile(expression)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def add_lookups(default_func: Callable[[str], Any], *lookups) -> Callable[[str], Any]:
-												Add add_lookups util function

											
										
										
											2017-06-03 20:44:47 +03:00
+								    """Extend an attribute function with special cases. If a word is in the
 								    lookups, the value is returned. Otherwise the previous function is used.
 								    default_func (callable): The default function to execute.
 								    *lookups (dict): Lookup dictionary mapping string to attribute value.
 								    RETURNS (callable): Lexical attribute getter.
 								    """
-												Convert closure to functools.partial, to promote pickling

											
										
										
											2017-10-17 19:20:52 +03:00
+								    # This is implemented as functools.partial instead of a closure, to allow
 								    # pickle to work.
 								    return functools.partial(_get_attr_unless_lookup, default_func, lookups)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def _get_attr_unless_lookup(
 								    default_func: Callable[[str], Any], lookups: Dict[str, Any], string: str
 								) -> Any:
-												Convert closure to functools.partial, to promote pickling

											
										
										
											2017-10-17 19:20:52 +03:00
+								    for lookup in lookups:
 								        if string in lookup:
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								            return lookup[string]  # type: ignore[index]
-												Convert closure to functools.partial, to promote pickling

											
										
										
											2017-10-17 19:20:52 +03:00
+								    return default_func(string)
-												Add add_lookups util function

											
										
										
											2017-06-03 20:44:47 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def update_exc(
 								    base_exceptions: Dict[str, List[dict]], *addition_dicts
 								) -> Dict[str, List[dict]]:
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Update and validate tokenizer exceptions. Will overwrite exceptions.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    base_exceptions (Dict[str, List[dict]]): Base exceptions.
 								    *addition_dicts (Dict[str, List[dict]]): Exceptions to add to the base dict, in order.
 								    RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    exc = dict(base_exceptions)
 								    for additions in addition_dicts:
 								        for orth, token_attrs in additions.items():
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								            if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            described_orth = "".join(attr[ORTH] for attr in token_attrs)
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								            if orth != described_orth:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E056.format(key=orth, orths=described_orth))
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								        exc.update(additions)
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 22:22:25 +03:00
+								    exc = expand_exc(exc, "'", "’")
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    return exc
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def expand_exc(
 								    excs: Dict[str, List[dict]], search: str, replace: str
 								) -> Dict[str, List[dict]]:
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Find string in tokenizer exceptions, duplicate entry and replace string.
 								    For example, to add additional versions with typographic apostrophes.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    excs (Dict[str, List[dict]]): Tokenizer exceptions.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    search (str): String to find and replace.
 								    replace (str): Replacement.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    def _fix_token(token, search, replace):
 								        fixed = dict(token)
 								        fixed[ORTH] = fixed[ORTH].replace(search, replace)
 								        return fixed
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 22:22:25 +03:00
+								    new_excs = dict(excs)
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    for token_string, tokens in excs.items():
 								        if search in token_string:
 								            new_key = token_string.replace(search, replace)
 								            new_value = [_fix_token(t, search, replace) for t in tokens]
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 22:22:25 +03:00
+								            new_excs[new_key] = new_value
 								    return new_excs
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def normalize_slice(
 								    length: int, start: int, stop: int, step: Optional[int] = None
 								) -> Tuple[int, int]:
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    if not (step is None or step == 1):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								        raise ValueError(Errors.E057)
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    if start is None:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        start = 0
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    elif start < 0:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        start += length
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    start = min(length, max(0, start))
 								    if stop is None:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        stop = length
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    elif stop < 0:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        stop += length
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    stop = min(length, max(start, stop))
 								    return start, stop
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
-												Add util.filter_spans helper (#3686)


											
										
										
											2019-05-08 03:33:40 +03:00
+								    """Filter a sequence of spans and remove duplicates or overlaps. Useful for
 								    creating named entities (where one token can only be part of one entity) or
 								    when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
 								    longest span is preferred over shorter spans.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    spans (Iterable[Span]): The spans to filter.
 								    RETURNS (List[Span]): The filtered spans.
-												Add util.filter_spans helper (#3686)


											
										
										
											2019-05-08 03:33:40 +03:00
+								    """
-												Fix util.filter_spans() to prefer first span in overlapping sam… (#4414)

* Update util.filter_spans() to prefer earlier spans

* Add filter_spans test for first same-length span

* Update entity relation example to refer to util.filter_spans()

											
										
										
											2019-10-10 18:00:03 +03:00
+								    get_sort_key = lambda span: (span.end - span.start, -span.start)
-												Add util.filter_spans helper (#3686)


											
										
										
											2019-05-08 03:33:40 +03:00
+								    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
 								    result = []
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    seen_tokens: Set[int] = set()
-												Add util.filter_spans helper (#3686)


											
										
										
											2019-05-08 03:33:40 +03:00
+								    for span in sorted_spans:
 								        # Check for end - 1 here because boundaries are inclusive
 								        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
 								            result.append(span)
-												Fix Issue 6207 (#6208)

* Regression test for issue 6207

* Fix issue 6207

* Sign contributor agreement

* Minor adjustments to test

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

											
										
										
											2020-10-06 12:17:37 +03:00
+								            seen_tokens.update(range(span.start, span.end))
-												Add util.filter_spans helper (#3686)


											
										
										
											2019-05-08 03:33:40 +03:00
+								    result = sorted(result, key=lambda span: span.start)
 								    return result
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def to_bytes(getters: Dict[str, Callable[[], bytes]], exclude: Iterable[str]) -> bytes:
-												Improve spacy.gold (no GoldParse, no json format!) (#5555)

* Update errors

* Remove beam for now (maybe)

Remove beam_utils

Update setup.py

Remove beam

* Remove GoldParse

WIP on removing goldparse

Get ArcEager compiling after GoldParse excise

Update setup.py

Get spacy.syntax compiling after removing GoldParse

Rename NewExample -> Example and clean up

Clean html files

Start updating tests

Update Morphologizer

* fix error numbers

* fix merge conflict

* informative error when calling to_array with wrong field

* fix error catching

* fixing language and scoring tests

* start testing get_aligned

* additional tests for new get_aligned function

* Draft create_gold_state for arc_eager oracle

* Fix import

* Fix import

* Remove TokenAnnotation code from nonproj

* fixing NER one-to-many alignment

* Fix many-to-one IOB codes

* fix test for misaligned

* attempt to fix cases with weird spaces

* fix spaces

* test_gold_biluo_different_tokenization works

* allow None as BILUO annotation

* fixed some tests + WIP roundtrip unit test

* add spaces to json output format

* minibatch utiltiy can deal with strings, docs or examples

* fix augment (needs further testing)

* various fixes in scripts - needs to be further tested

* fix test_cli

* cleanup

* correct silly typo

* add support for MORPH in to/from_array, fix morphologizer overfitting test

* fix tagger

* fix entity linker

* ensure test keeps working with non-linked entities

* pipe() takes docs, not examples

* small bug fix

* textcat bugfix

* throw informative error when running the components with the wrong type of objects

* fix parser tests to work with example (most still failing)

* fix BiluoPushDown parsing entities

* small fixes

* bugfix tok2vec

* fix renames and simple_ner labels

* various small fixes

* prevent writing dummy values like deps because that could interfer with sent_start values

* fix the fix

* implement split_sent with aligned SENT_START attribute

* test for split sentences with various alignment issues, works

* Return ArcEagerGoldParse from ArcEager

* Update parser and NER gold stuff

* Draft new GoldCorpus class

* add links to to_dict

* clean up

* fix test checking for variants

* Fix oracles

* Start updating converters

* Move converters under spacy.gold

* Move things around

* Fix naming

* Fix name

* Update converter to produce DocBin

* Update converters

* Allow DocBin to take list of Doc objects.

* Make spacy convert output docbin

* Fix import

* Fix docbin

* Fix compile in ArcEager

* Fix import

* Serialize all attrs by default

* Update converter

* Remove jsonl converter

* Add json2docs converter

* Draft Corpus class for DocBin

* Work on train script

* Update Corpus

* Update DocBin

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Update train.py

* Fix Corpus

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

* Skip tests that cause crashes

* Skip test causing segfault

* Remove GoldCorpus

* Update imports

* Update after removing GoldCorpus

* Fix module name of corpus

* Fix mimport

* Work on parser oracle

* Update arc_eager oracle

* Restore ArcEager.get_cost function

* Update transition system

* Update test_arc_eager_oracle

* Remove beam test

* Update test

* Unskip

* Unskip tests

* add links to to_dict

* clean up

* fix test checking for variants

* Allow DocBin to take list of Doc objects.

* Fix compile in ArcEager

* Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Start updating converters

* Work on train script

* Draft Corpus class for DocBin

Update Corpus

Fix Corpus

* Update DocBin

Add missing strings when serializing

* Update train.py

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

Skip tests that cause crashes

Skip test causing segfault

* Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix mimport

* Work on parser oracle

Update arc_eager oracle

Restore ArcEager.get_cost function

Update transition system

* Update tests

Remove beam test

Update test

Unskip

Unskip tests

* Add get_aligned_parse method in Example

Fix Example.get_aligned_parse

* Add kwargs to Corpus.dev_dataset to match train_dataset

* Update nonproj

* Use get_aligned_parse in ArcEager

* Add another arc-eager oracle test

* Remove Example.doc property

Remove Example.doc

Remove Example.doc

Remove Example.doc

Remove Example.doc

* Update ArcEager oracle

Fix Break oracle

* Debugging

* Fix Corpus

* Fix eg.doc

* Format

* small fixes

* limit arg for Corpus

* fix test_roundtrip_docs_to_docbin

* fix test_make_orth_variants

* fix add_label test

* Update tests

* avoid writing temp dir in json2docs, fixing 4402 test

* Update test

* Add missing costs to NER oracle

* Update test

* Work on Example.get_aligned_ner method

* Clean up debugging

* Xfail tests

* Remove prints

* Remove print

* Xfail some tests

* Replace unseen labels for parser

* Update test

* Update test

* Xfail test

* Fix Corpus

* fix imports

* fix docs_to_json

* various small fixes

* cleanup

* Support gold_preproc in Corpus

* Support gold_preproc

* Pass gold_preproc setting into corpus

* Remove debugging

* Fix gold_preproc

* Fix json2docs converter

* Fix convert command

* Fix flake8

* Fix import

* fix output_dir (converted to Path by typer)

* fix var

* bugfix: update states after creating golds to avoid out of bounds indexing

* Improve efficiency of ArEager oracle

* pull merge_sent into iob2docs to avoid Doc creation for each line

* fix asserts

* bugfix excl Span.end in iob2docs

* Support max_length in Corpus

* Fix arc_eager oracle

* Filter out uannotated sentences in NER

* Remove debugging in parser

* Simplify NER alignment

* Fix conversion of NER data

* Fix NER init_gold_batch

* Tweak efficiency of precomputable affine

* Update onto-json default

* Update gold test for NER

* Fix parser test

* Update test

* Add NER data test

* Fix convert for single file

* Fix test

* Hack scorer to avoid evaluating non-nered data

* Fix handling of NER data in Example

* Output unlabelled spans from O biluo tags in iob_utils

* Fix unset variable

* Return kept examples from init_gold_batch

* Return examples from init_gold_batch

* Dont return Example from init_gold_batch

* Set spaces on gold doc after conversion

* Add test

* Fix spaces reading

* Improve NER alignment

* Improve handling of missing values in NER

* Restore the 'cutting' in parser training

* Add assertion

* Print epochs

* Restore random cuts in parser/ner training

* Implement Doc.copy

* Implement Example.copy

* Copy examples at the start of Language.update

* Don't unset example docs

* Tweak parser model slightly

* attempt to fix _guess_spaces

* _add_entities_to_doc first, so that links don't get overwritten

* fixing get_aligned_ner for one-to-many

* fix indexing into x_text

* small fix biluo_tags_from_offsets

* Add onto-ner config

* Simplify NER alignment

* Fix NER scoring for partially annotated documents

* fix indexing into x_text

* fix test_cli failing tests by ignoring spans in doc.ents with empty label

* Fix limit

* Improve NER alignment

* Fix count_train

* Remove print statement

* fix tests, we're not having nothing but None

* fix clumsy fingers

* Fix tests

* Fix doc.ents

* Remove empty docs in Corpus and improve limit

* Update config

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2020-06-26 20:34:12 +03:00
+								    return srsly.msgpack_dumps(to_dict(getters, exclude))
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def from_bytes(
 								    bytes_data: bytes,
 								    setters: Dict[str, Callable[[bytes], Any]],
 								    exclude: Iterable[str],
 								) -> None:
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude)  # type: ignore[return-value]
-												Improve spacy.gold (no GoldParse, no json format!) (#5555)

* Update errors

* Remove beam for now (maybe)

Remove beam_utils

Update setup.py

Remove beam

* Remove GoldParse

WIP on removing goldparse

Get ArcEager compiling after GoldParse excise

Update setup.py

Get spacy.syntax compiling after removing GoldParse

Rename NewExample -> Example and clean up

Clean html files

Start updating tests

Update Morphologizer

* fix error numbers

* fix merge conflict

* informative error when calling to_array with wrong field

* fix error catching

* fixing language and scoring tests

* start testing get_aligned

* additional tests for new get_aligned function

* Draft create_gold_state for arc_eager oracle

* Fix import

* Fix import

* Remove TokenAnnotation code from nonproj

* fixing NER one-to-many alignment

* Fix many-to-one IOB codes

* fix test for misaligned

* attempt to fix cases with weird spaces

* fix spaces

* test_gold_biluo_different_tokenization works

* allow None as BILUO annotation

* fixed some tests + WIP roundtrip unit test

* add spaces to json output format

* minibatch utiltiy can deal with strings, docs or examples

* fix augment (needs further testing)

* various fixes in scripts - needs to be further tested

* fix test_cli

* cleanup

* correct silly typo

* add support for MORPH in to/from_array, fix morphologizer overfitting test

* fix tagger

* fix entity linker

* ensure test keeps working with non-linked entities

* pipe() takes docs, not examples

* small bug fix

* textcat bugfix

* throw informative error when running the components with the wrong type of objects

* fix parser tests to work with example (most still failing)

* fix BiluoPushDown parsing entities

* small fixes

* bugfix tok2vec

* fix renames and simple_ner labels

* various small fixes

* prevent writing dummy values like deps because that could interfer with sent_start values

* fix the fix

* implement split_sent with aligned SENT_START attribute

* test for split sentences with various alignment issues, works

* Return ArcEagerGoldParse from ArcEager

* Update parser and NER gold stuff

* Draft new GoldCorpus class

* add links to to_dict

* clean up

* fix test checking for variants

* Fix oracles

* Start updating converters

* Move converters under spacy.gold

* Move things around

* Fix naming

* Fix name

* Update converter to produce DocBin

* Update converters

* Allow DocBin to take list of Doc objects.

* Make spacy convert output docbin

* Fix import

* Fix docbin

* Fix compile in ArcEager

* Fix import

* Serialize all attrs by default

* Update converter

* Remove jsonl converter

* Add json2docs converter

* Draft Corpus class for DocBin

* Work on train script

* Update Corpus

* Update DocBin

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Update train.py

* Fix Corpus

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

* Skip tests that cause crashes

* Skip test causing segfault

* Remove GoldCorpus

* Update imports

* Update after removing GoldCorpus

* Fix module name of corpus

* Fix mimport

* Work on parser oracle

* Update arc_eager oracle

* Restore ArcEager.get_cost function

* Update transition system

* Update test_arc_eager_oracle

* Remove beam test

* Update test

* Unskip

* Unskip tests

* add links to to_dict

* clean up

* fix test checking for variants

* Allow DocBin to take list of Doc objects.

* Fix compile in ArcEager

* Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Start updating converters

* Work on train script

* Draft Corpus class for DocBin

Update Corpus

Fix Corpus

* Update DocBin

Add missing strings when serializing

* Update train.py

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

Skip tests that cause crashes

Skip test causing segfault

* Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix mimport

* Work on parser oracle

Update arc_eager oracle

Restore ArcEager.get_cost function

Update transition system

* Update tests

Remove beam test

Update test

Unskip

Unskip tests

* Add get_aligned_parse method in Example

Fix Example.get_aligned_parse

* Add kwargs to Corpus.dev_dataset to match train_dataset

* Update nonproj

* Use get_aligned_parse in ArcEager

* Add another arc-eager oracle test

* Remove Example.doc property

Remove Example.doc

Remove Example.doc

Remove Example.doc

Remove Example.doc

* Update ArcEager oracle

Fix Break oracle

* Debugging

* Fix Corpus

* Fix eg.doc

* Format

* small fixes

* limit arg for Corpus

* fix test_roundtrip_docs_to_docbin

* fix test_make_orth_variants

* fix add_label test

* Update tests

* avoid writing temp dir in json2docs, fixing 4402 test

* Update test

* Add missing costs to NER oracle

* Update test

* Work on Example.get_aligned_ner method

* Clean up debugging

* Xfail tests

* Remove prints

* Remove print

* Xfail some tests

* Replace unseen labels for parser

* Update test

* Update test

* Xfail test

* Fix Corpus

* fix imports

* fix docs_to_json

* various small fixes

* cleanup

* Support gold_preproc in Corpus

* Support gold_preproc

* Pass gold_preproc setting into corpus

* Remove debugging

* Fix gold_preproc

* Fix json2docs converter

* Fix convert command

* Fix flake8

* Fix import

* fix output_dir (converted to Path by typer)

* fix var

* bugfix: update states after creating golds to avoid out of bounds indexing

* Improve efficiency of ArEager oracle

* pull merge_sent into iob2docs to avoid Doc creation for each line

* fix asserts

* bugfix excl Span.end in iob2docs

* Support max_length in Corpus

* Fix arc_eager oracle

* Filter out uannotated sentences in NER

* Remove debugging in parser

* Simplify NER alignment

* Fix conversion of NER data

* Fix NER init_gold_batch

* Tweak efficiency of precomputable affine

* Update onto-json default

* Update gold test for NER

* Fix parser test

* Update test

* Add NER data test

* Fix convert for single file

* Fix test

* Hack scorer to avoid evaluating non-nered data

* Fix handling of NER data in Example

* Output unlabelled spans from O biluo tags in iob_utils

* Fix unset variable

* Return kept examples from init_gold_batch

* Return examples from init_gold_batch

* Dont return Example from init_gold_batch

* Set spaces on gold doc after conversion

* Add test

* Fix spaces reading

* Improve NER alignment

* Improve handling of missing values in NER

* Restore the 'cutting' in parser training

* Add assertion

* Print epochs

* Restore random cuts in parser/ner training

* Implement Doc.copy

* Implement Example.copy

* Copy examples at the start of Language.update

* Don't unset example docs

* Tweak parser model slightly

* attempt to fix _guess_spaces

* _add_entities_to_doc first, so that links don't get overwritten

* fixing get_aligned_ner for one-to-many

* fix indexing into x_text

* small fix biluo_tags_from_offsets

* Add onto-ner config

* Simplify NER alignment

* Fix NER scoring for partially annotated documents

* fix indexing into x_text

* fix test_cli failing tests by ignoring spans in doc.ents with empty label

* Fix limit

* Improve NER alignment

* Fix count_train

* Remove print statement

* fix tests, we're not having nothing but None

* fix clumsy fingers

* Fix tests

* Fix doc.ents

* Remove empty docs in Corpus and improve limit

* Update config

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2020-06-26 20:34:12 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def to_dict(
 								    getters: Dict[str, Callable[[], Any]], exclude: Iterable[str]
 								) -> Dict[str, Any]:
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    serialized = {}
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								    for key, getter in getters.items():
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        # Split to support file names like meta.json
 								        if key.split(".")[0] not in exclude:
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								            serialized[key] = getter()
-												Improve spacy.gold (no GoldParse, no json format!) (#5555)

* Update errors

* Remove beam for now (maybe)

Remove beam_utils

Update setup.py

Remove beam

* Remove GoldParse

WIP on removing goldparse

Get ArcEager compiling after GoldParse excise

Update setup.py

Get spacy.syntax compiling after removing GoldParse

Rename NewExample -> Example and clean up

Clean html files

Start updating tests

Update Morphologizer

* fix error numbers

* fix merge conflict

* informative error when calling to_array with wrong field

* fix error catching

* fixing language and scoring tests

* start testing get_aligned

* additional tests for new get_aligned function

* Draft create_gold_state for arc_eager oracle

* Fix import

* Fix import

* Remove TokenAnnotation code from nonproj

* fixing NER one-to-many alignment

* Fix many-to-one IOB codes

* fix test for misaligned

* attempt to fix cases with weird spaces

* fix spaces

* test_gold_biluo_different_tokenization works

* allow None as BILUO annotation

* fixed some tests + WIP roundtrip unit test

* add spaces to json output format

* minibatch utiltiy can deal with strings, docs or examples

* fix augment (needs further testing)

* various fixes in scripts - needs to be further tested

* fix test_cli

* cleanup

* correct silly typo

* add support for MORPH in to/from_array, fix morphologizer overfitting test

* fix tagger

* fix entity linker

* ensure test keeps working with non-linked entities

* pipe() takes docs, not examples

* small bug fix

* textcat bugfix

* throw informative error when running the components with the wrong type of objects

* fix parser tests to work with example (most still failing)

* fix BiluoPushDown parsing entities

* small fixes

* bugfix tok2vec

* fix renames and simple_ner labels

* various small fixes

* prevent writing dummy values like deps because that could interfer with sent_start values

* fix the fix

* implement split_sent with aligned SENT_START attribute

* test for split sentences with various alignment issues, works

* Return ArcEagerGoldParse from ArcEager

* Update parser and NER gold stuff

* Draft new GoldCorpus class

* add links to to_dict

* clean up

* fix test checking for variants

* Fix oracles

* Start updating converters

* Move converters under spacy.gold

* Move things around

* Fix naming

* Fix name

* Update converter to produce DocBin

* Update converters

* Allow DocBin to take list of Doc objects.

* Make spacy convert output docbin

* Fix import

* Fix docbin

* Fix compile in ArcEager

* Fix import

* Serialize all attrs by default

* Update converter

* Remove jsonl converter

* Add json2docs converter

* Draft Corpus class for DocBin

* Work on train script

* Update Corpus

* Update DocBin

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Update train.py

* Fix Corpus

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

* Skip tests that cause crashes

* Skip test causing segfault

* Remove GoldCorpus

* Update imports

* Update after removing GoldCorpus

* Fix module name of corpus

* Fix mimport

* Work on parser oracle

* Update arc_eager oracle

* Restore ArcEager.get_cost function

* Update transition system

* Update test_arc_eager_oracle

* Remove beam test

* Update test

* Unskip

* Unskip tests

* add links to to_dict

* clean up

* fix test checking for variants

* Allow DocBin to take list of Doc objects.

* Fix compile in ArcEager

* Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Start updating converters

* Work on train script

* Draft Corpus class for DocBin

Update Corpus

Fix Corpus

* Update DocBin

Add missing strings when serializing

* Update train.py

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

Skip tests that cause crashes

Skip test causing segfault

* Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix mimport

* Work on parser oracle

Update arc_eager oracle

Restore ArcEager.get_cost function

Update transition system

* Update tests

Remove beam test

Update test

Unskip

Unskip tests

* Add get_aligned_parse method in Example

Fix Example.get_aligned_parse

* Add kwargs to Corpus.dev_dataset to match train_dataset

* Update nonproj

* Use get_aligned_parse in ArcEager

* Add another arc-eager oracle test

* Remove Example.doc property

Remove Example.doc

Remove Example.doc

Remove Example.doc

Remove Example.doc

* Update ArcEager oracle

Fix Break oracle

* Debugging

* Fix Corpus

* Fix eg.doc

* Format

* small fixes

* limit arg for Corpus

* fix test_roundtrip_docs_to_docbin

* fix test_make_orth_variants

* fix add_label test

* Update tests

* avoid writing temp dir in json2docs, fixing 4402 test

* Update test

* Add missing costs to NER oracle

* Update test

* Work on Example.get_aligned_ner method

* Clean up debugging

* Xfail tests

* Remove prints

* Remove print

* Xfail some tests

* Replace unseen labels for parser

* Update test

* Update test

* Xfail test

* Fix Corpus

* fix imports

* fix docs_to_json

* various small fixes

* cleanup

* Support gold_preproc in Corpus

* Support gold_preproc

* Pass gold_preproc setting into corpus

* Remove debugging

* Fix gold_preproc

* Fix json2docs converter

* Fix convert command

* Fix flake8

* Fix import

* fix output_dir (converted to Path by typer)

* fix var

* bugfix: update states after creating golds to avoid out of bounds indexing

* Improve efficiency of ArEager oracle

* pull merge_sent into iob2docs to avoid Doc creation for each line

* fix asserts

* bugfix excl Span.end in iob2docs

* Support max_length in Corpus

* Fix arc_eager oracle

* Filter out uannotated sentences in NER

* Remove debugging in parser

* Simplify NER alignment

* Fix conversion of NER data

* Fix NER init_gold_batch

* Tweak efficiency of precomputable affine

* Update onto-json default

* Update gold test for NER

* Fix parser test

* Update test

* Add NER data test

* Fix convert for single file

* Fix test

* Hack scorer to avoid evaluating non-nered data

* Fix handling of NER data in Example

* Output unlabelled spans from O biluo tags in iob_utils

* Fix unset variable

* Return kept examples from init_gold_batch

* Return examples from init_gold_batch

* Dont return Example from init_gold_batch

* Set spaces on gold doc after conversion

* Add test

* Fix spaces reading

* Improve NER alignment

* Improve handling of missing values in NER

* Restore the 'cutting' in parser training

* Add assertion

* Print epochs

* Restore random cuts in parser/ner training

* Implement Doc.copy

* Implement Example.copy

* Copy examples at the start of Language.update

* Don't unset example docs

* Tweak parser model slightly

* attempt to fix _guess_spaces

* _add_entities_to_doc first, so that links don't get overwritten

* fixing get_aligned_ner for one-to-many

* fix indexing into x_text

* small fix biluo_tags_from_offsets

* Add onto-ner config

* Simplify NER alignment

* Fix NER scoring for partially annotated documents

* fix indexing into x_text

* fix test_cli failing tests by ignoring spans in doc.ents with empty label

* Fix limit

* Improve NER alignment

* Fix count_train

* Remove print statement

* fix tests, we're not having nothing but None

* fix clumsy fingers

* Fix tests

* Fix doc.ents

* Remove empty docs in Corpus and improve limit

* Update config

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2020-06-26 20:34:12 +03:00
+								    return serialized
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def from_dict(
 								    msg: Dict[str, Any],
 								    setters: Dict[str, Callable[[Any], Any]],
 								    exclude: Iterable[str],
 								) -> Dict[str, Any]:
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								    for key, setter in setters.items():
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        # Split to support file names like meta.json
 								        if key.split(".")[0] not in exclude and key in msg:
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								            setter(msg[key])
 								    return msg
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def to_disk(
 								    path: Union[str, Path],
 								    writers: Dict[str, Callable[[Path], None]],
 								    exclude: Iterable[str],
 								) -> Path:
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								    path = ensure_path(path)
 								    if not path.exists():
 								        path.mkdir()
 								    for key, writer in writers.items():
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        # Split to support file names like meta.json
 								        if key.split(".")[0] not in exclude:
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								            writer(path / key)
 								    return path
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def from_disk(
 								    path: Union[str, Path],
 								    readers: Dict[str, Callable[[Path], None]],
 								    exclude: Iterable[str],
 								) -> Path:
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								    path = ensure_path(path)
 								    for key, reader in readers.items():
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        # Split to support file names like meta.json
 								        if key.split(".")[0] not in exclude:
-												Fix deserialization of vectors

											
										
										
											2017-10-16 21:55:00 +03:00
+								            reader(path / key)
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								    return path
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    """Import module from a file. Used to load models from a directory.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    name (str): Name of module to load.
 								    loc (str / Path): Path to the file.
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    RETURNS: The loaded module.
 								    """
-												Save one line of code

											
										
										
											2020-10-03 12:41:28 +03:00
+								    spec = importlib.util.spec_from_file_location(name, str(loc))
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    module = importlib.util.module_from_spec(spec)  # type: ignore[arg-type]
 								    spec.loader.exec_module(module)  # type: ignore[union-attr]
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    return module
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def minify_html(html: str) -> str:
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 18:50:23 +03:00
+								    """Perform a template-specific, rudimentary HTML minification for displaCy.
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								    Disclaimer: NOT a general-purpose solution, only removes indentation and
 								    newlines.
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 18:50:23 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    html (str): Markup to minify.
 								    RETURNS (str): "Minified" HTML.
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 18:50:23 +03:00
+								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    return html.strip().replace("    ", "").replace("\n", "")
-												Add util function to enable GPU

											
										
										
											2017-09-21 03:16:35 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def escape_html(text: str) -> str:
-												escape html in displacy.render (#2378) (closes #2361)

## Description
Fix for issue #2361 :
replace &, <, >, " with &amp;amp; , &amp;lt; , &amp;gt; , &amp;quot; in before rendering svg

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
(As discussed in the comments to #2361)
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-05-28 19:36:41 +03:00
+								    """Replace <, >, &, " with their HTML encoded representation. Intended to
 								    prevent HTML errors in rendered displaCy markup.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    text (str): The original text.
 								    RETURNS (str): Equivalent text to be safely used within HTML.
-												escape html in displacy.render (#2378) (closes #2361)

## Description
Fix for issue #2361 :
replace &, <, >, " with &amp;amp; , &amp;lt; , &amp;gt; , &amp;quot; in before rendering svg

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
(As discussed in the comments to #2361)
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-05-28 19:36:41 +03:00
+								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    text = text.replace("&", "&amp;")
 								    text = text.replace("<", "&lt;")
 								    text = text.replace(">", "&gt;")
 								    text = text.replace('"', "&quot;")
-												escape html in displacy.render (#2378) (closes #2361)

## Description
Fix for issue #2361 :
replace &, <, >, " with &amp;amp; , &amp;lt; , &amp;gt; , &amp;quot; in before rendering svg

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
(As discussed in the comments to #2361)
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-05-28 19:36:41 +03:00
+								    return text
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_words_and_spaces(
 								    words: Iterable[str], text: str
 								) -> Tuple[List[str], List[bool]]:
-												Update docs, types and API consistency

											
										
										
											2020-08-17 17:45:24 +03:00
+								    """Given a list of words and a text, reconstruct the original tokens and
 								    return a list of words and spaces that can be used to create a Doc. This
 								    can help recover destructive tokenization that didn't preserve any
 								    whitespace information.
 								    words (Iterable[str]): The words.
 								    text (str): The original text.
 								    RETURNS (Tuple[List[str], List[bool]]): The words and spaces.
 								    """
-												Improve GoldParse NER alignment (#5335)

Improve GoldParse NER alignment by including all cases where the start
and end of the NER span can be aligned, regardless of internal
tokenization differences.

To do this, convert BILUO tags to character offsets, check start/end
alignment with `doc.char_span()`, and assign the BILUO tags for the
aligned spans. Alignment for `O/-` tags is handled through the
one-to-one and multi alignments.
											
										
										
											2020-04-23 17:58:23 +03:00
+								    if "".join("".join(words).split()) != "".join(text.split()):
-												Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text

Add an option to initialize a `Doc` from a text and list of words where
the words may or may not include all whitespace tokens. If the text and
words are mismatched, raise an error.

* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
											
										
										
											2020-04-14 20:15:52 +03:00
+								        raise ValueError(Errors.E194.format(text=text, words=words))
 								    text_words = []
 								    text_spaces = []
 								    text_pos = 0
 								    # normalize words to remove all whitespace tokens
 								    norm_words = [word for word in words if not word.isspace()]
 								    # align words with text
 								    for word in norm_words:
 								        try:
 								            word_start = text[text_pos:].index(word)
 								        except ValueError:
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								            raise ValueError(Errors.E194.format(text=text, words=words)) from None
-												Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text

Add an option to initialize a `Doc` from a text and list of words where
the words may or may not include all whitespace tokens. If the text and
words are mismatched, raise an error.

* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
											
										
										
											2020-04-14 20:15:52 +03:00
+								        if word_start > 0:
-												Tidy up and auto-format

											
										
										
											2020-05-21 15:14:01 +03:00
+								            text_words.append(text[text_pos : text_pos + word_start])
-												Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text

Add an option to initialize a `Doc` from a text and list of words where
the words may or may not include all whitespace tokens. If the text and
words are mismatched, raise an error.

* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
											
										
										
											2020-04-14 20:15:52 +03:00
+								            text_spaces.append(False)
 								            text_pos += word_start
 								        text_words.append(word)
 								        text_spaces.append(False)
 								        text_pos += len(word)
 								        if text_pos < len(text) and text[text_pos] == " ":
 								            text_spaces[-1] = True
 								            text_pos += 1
 								    if text_pos < len(text):
 								        text_words.append(text[text_pos:])
 								        text_spaces.append(False)
 								    return (text_words, text_spaces)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
 								    """Deep copy a Config. Will raise an error if the config contents are not
 								    JSON-serializable.
 								    config (Config): The config to copy.
 								    RETURNS (Config): The copied config.
-												Add SimpleFrozenDict util to use as default function argument

											
										
										
											2018-05-20 16:13:37 +03:00
+								    """
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    try:
 								        return Config(config).copy()
 								    except ValueError:
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								        raise ValueError(Errors.E961.format(config=config)) from None
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Add SimpleFrozenDict util to use as default function argument

											
										
										
											2018-05-20 16:13:37 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
 								    """Convert dot notation to a dict. For example: {"token.pos": True,
 								    "token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
 								    values (Dict[str, Any]): The key/value pairs to convert.
 								    RETURNS (Dict[str, dict]): The converted values.
 								    """
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    result: Dict[str, dict] = {}
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    for key, value in values.items():
 								        path = result
 								        parts = key.lower().split(".")
 								        for i, item in enumerate(parts):
 								            is_last = i == len(parts) - 1
 								            path = path.setdefault(item, value if is_last else {})
 								    return result
 								def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
 								    """Convert dot notation to a dict. For example: {"token": {"pos": True,
 								    "_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.
 								    values (Dict[str, dict]): The dict to convert.
 								    RETURNS (Dict[str, Any]): The key/value pairs.
 								    """
 								    return {".".join(key): value for key, value in walk_dict(obj)}
-												util function dot_to_object and corresponding unit test

											
										
										
											2020-07-27 18:50:12 +03:00
+								def dot_to_object(config: Config, section: str):
 								    """Convert dot notation of a "section" to a specific part of the Config.
 								    e.g. "training.optimizer" would return the Optimizer object.
 								    Throws an error if the section is not defined in this config.
 								    config (Config): The config.
 								    section (str): The dot notation of the section in the config.
 								    RETURNS: The object denoted by the section
 								    """
 								    component = config
 								    parts = section.split(".")
 								    for item in parts:
 								        try:
 								            component = component[item]
-												Move error to Errors

											
										
										
											2020-07-28 17:24:14 +03:00
+								        except (KeyError, TypeError):
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								            raise KeyError(Errors.E952.format(name=section)) from None
-												util function dot_to_object and corresponding unit test

											
										
										
											2020-07-27 18:50:12 +03:00
+								    return component
-												Add option to replace listeners for sourced components

											
										
										
											2021-01-29 07:57:04 +03:00
+								def set_dot_to_object(config: Config, section: str, value: Any) -> None:
 								    """Update a config at a given position from a dot notation.
 								    config (Config): The config.
 								    section (str): The dot notation of the section in the config.
 								    value (Any): The value to set in the config.
 								    """
 								    component = config
 								    parts = section.split(".")
 								    for i, item in enumerate(parts):
 								        try:
 								            if i == len(parts) - 1:
 								                component[item] = value
 								            else:
 								                component = component[item]
 								        except (KeyError, TypeError):
 								            raise KeyError(Errors.E952.format(name=section)) from None
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def walk_dict(
 								    node: Dict[str, Any], parent: List[str] = []
 								) -> Iterator[Tuple[List[str], Any]]:
 								    """Walk a dict and yield the path and values of the leaves."""
 								    for key, value in node.items():
 								        key_parent = [*parent, key]
 								        if isinstance(value, dict):
 								            yield from walk_dict(value, key_parent)
 								        else:
 								            yield (key_parent, value)
 								def get_arg_names(func: Callable) -> List[str]:
 								    """Get a list of all named arguments of a function (regular,
 								    keyword-only).
 								    func (Callable): The function
 								    RETURNS (List[str]): The argument names.
 								    """
 								    argspec = inspect.getfullargspec(func)
-												Fix inconsistent lemmas (#9405)

* Add util function to unique lists and preserve order

* Use unique function instead of list(set())

list(set()) has the issue that it's not consistent between runs of the
Python interpreter, so order can vary.

list(set()) calls were left in a few places where they were behind calls
to sorted(). I think in this case the calls to list() can be removed,
but this commit doesn't do that.

* Use the existing pattern for this
											
										
										
											2021-10-11 12:38:45 +03:00
+								    return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
-												Making `lang/th/test_tokenizer.py` pass by creating `ThaiTokenizer` (#3078)


											
										
										
											2019-01-10 17:40:37 +03:00
-												Fix combined scores and update test

											
										
										
											2020-09-24 11:42:47 +03:00
+								def combine_score_weights(
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    weights: List[Dict[str, Optional[float]]],
 								    overrides: Dict[str, Optional[float]] = SimpleFrozenDict(),
 								) -> Dict[str, Optional[float]]:
-												Allow pipeline components to set default scores and weights

											
										
										
											2020-07-26 14:18:43 +03:00
+								    """Combine and normalize score weights defined by components, e.g.
 								    {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.
 								    weights (List[dict]): The weights defined by the components.
-												Fix combined scores and update test

											
										
										
											2020-09-24 11:42:47 +03:00
+								    overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
 								        should be preserved.
-												Allow pipeline components to set default scores and weights

											
										
										
											2020-07-26 14:18:43 +03:00
+								    RETURNS (Dict[str, float]): The combined and normalized weights.
 								    """
-												Fix scoring normalization (#7629)

* fix scoring normalization

* score weights by total sum instead of per component

* cleanup

* more cleanup
											
										
										
											2021-04-26 17:53:38 +03:00
+								    # We divide each weight by the total weight sum.
-												Fix handling of score_weights

											
										
										
											2020-09-24 11:27:33 +03:00
+								    # We first need to extract all None/null values for score weights that
 								    # shouldn't be shown in the table *or* be weighted
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    result: Dict[str, Optional[float]] = {
 								        key: value for w_dict in weights for (key, value) in w_dict.items()
 								    }
-												Add SpanCategorizer component (#6747)

* Draft spancat model

* Add spancat model

* Add test for extract_spans

* Add extract_spans layer

* Upd extract_spans

* Add spancat model

* Add test for spancat model

* Upd spancat model

* Update spancat component

* Upd spancat

* Update spancat model

* Add quick spancat test

* Import SpanCategorizer

* Fix SpanCategorizer component

* Import SpanGroup

* Fix span extraction

* Fix import

* Fix import

* Upd model

* Update spancat models

* Add scoring, update defaults

* Update and add docs

* Fix type

* Update spacy/ml/extract_spans.py

* Auto-format and fix import

* Fix comment

* Fix type

* Fix type

* Update website/docs/api/spancategorizer.md

* Fix comment

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Better defense

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix labels list

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/extract_spans.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/pipeline/spancat.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Set annotations during update

* Set annotations in spancat

* fix imports in test

* Update spacy/pipeline/spancat.py

* replace MaxoutLogistic with LinearLogistic

* fix config

* various small fixes

* remove set_annotations parameter in update

* use our beloved tupley format with recent support for doc.spans

* bugfix to allow renaming the default span_key (scores weren't showing up)

* use different key in docs example

* change defaults to better-working parameters from project (WIP)

* register spacy.extract_spans.v1 for legacy purposes

* Upd dev version so can build wheel

* layers instead of architectures for smaller building blocks

* Update website/docs/api/spancategorizer.md

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update website/docs/api/spancategorizer.md

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Include additional scores from overrides in combined score weights

* Parameterize spans key in scoring

Parameterize the `SpanCategorizer` `spans_key` for scoring purposes so
that it's possible to evaluate multiple `spancat` components in the same
pipeline.

* Use the (intentionally very short) default spans key `sc` in the
  `SpanCategorizer`
* Adjust the default score weights to include the default key
* Adjust the scorer to use `spans_{spans_key}` as the prefix for the
  returned score
* Revert addition of `attr_name` argument to `score_spans` and adjust
  the key in the `getter` instead.

Note that for `spancat` components with a custom `span_key`, the score
weights currently need to be modified manually in
`[training.score_weights]` for them to be available during training. To
suppress the default score weights `spans_sc_p/r/f` during training, set
them to `null` in `[training.score_weights]`.

* Update website/docs/api/scorer.md

* Fix scorer for spans key containing underscore

* Increment version

* Add Spans to Evaluate CLI (#8439)

* Add Spans to Evaluate CLI

* Change to spans_key

* Add spans per_type output

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix spancat GPU issues (#8455)

* Fix GPU issues

* Require thinc >=8.0.6

* Switch to glorot_uniform_init

* Fix and test ngram suggester

* Include final ngram in doc for all sizes
* Fix ngrams for docs of the same length as ngram size
* Handle batches of docs that result in no ngrams
* Add tests

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Nirant <NirantK@users.noreply.github.com>
											
										
										
											2021-06-24 13:35:27 +03:00
+								    result.update(overrides)
-												Fix scoring normalization (#7629)

* fix scoring normalization

* score weights by total sum instead of per component

* cleanup

* more cleanup
											
										
										
											2021-04-26 17:53:38 +03:00
+								    weight_sum = sum([v if v else 0.0 for v in result.values()])
 								    for key, value in result.items():
 								        if value and weight_sum > 0:
 								            result[key] = round(value / weight_sum, 2)
-												Allow pipeline components to set default scores and weights

											
										
										
											2020-07-26 14:18:43 +03:00
+								    return result
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
+								class DummyTokenizer:
-												JapaneseTokenizer.pipe added (#6515)

* JapaneseTokenizer.pipe added

For [spacymoji](https://spacy.io/universe/project/spacymoji)  with `Japanese()`.

* DummyTokenizer.pipe added instead
											
										
										
											2020-12-08 22:02:23 +03:00
+								    def __call__(self, text):
 								        raise NotImplementedError
 								    def pipe(self, texts, **kwargs):
 								        for text in texts:
 								            yield self(text)
-												Making `lang/th/test_tokenizer.py` pass by creating `ThaiTokenizer` (#3078)


											
										
										
											2019-01-10 17:40:37 +03:00
+								    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
 								    # allow serialization (see #1557)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def to_bytes(self, **kwargs):
-												Auto-format

											
										
										
											2019-02-07 23:00:04 +03:00
+								        return b""
-												Making `lang/th/test_tokenizer.py` pass by creating `ThaiTokenizer` (#3078)


											
										
										
											2019-01-10 17:40:37 +03:00
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    def from_bytes(self, data: bytes, **kwargs) -> "DummyTokenizer":
-												Making `lang/th/test_tokenizer.py` pass by creating `ThaiTokenizer` (#3078)


											
										
										
											2019-01-10 17:40:37 +03:00
+								        return self
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
-												Making `lang/th/test_tokenizer.py` pass by creating `ThaiTokenizer` (#3078)


											
										
										
											2019-01-10 17:40:37 +03:00
+								        return None
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    def from_disk(self, path: Union[str, Path], **kwargs) -> "DummyTokenizer":
-												Making `lang/th/test_tokenizer.py` pass by creating `ThaiTokenizer` (#3078)


											
										
										
											2019-01-10 17:40:37 +03:00
+								        return self
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def create_default_optimizer() -> Optimizer:
-												Remove env_opt and simplfy default Optimizer

											
										
										
											2020-08-14 15:59:54 +03:00
+								    return Adam()
-												Create corpus iterator and batcher from registry during training (#5865)

* Move batchers into their own module (and registry)

* Update CLI

* Update Corpus and batcher

* Update tests

* Update one config

* Merge 'evaluation' block back under [training]

* Import batchers in gold __init__

* Fix batchers

* Update config

* Update schema

* Update util

* Don't assume train and dev are actually paths

* Update onto-joint config

* Fix missing import

* Format

* Format

* Update spacy/gold/corpus.py

Co-authored-by: Ines Montani <ines@ines.io>

* Fix name

* Update default config

* Fix get_length option in batchers

* Update test

* Add comment

* Pass path into Corpus

* Update docstring

* Update schema and configs

* Update config

* Fix test

* Fix paths

* Fix print

* Fix create_train_batches

* [training.read_train] -> [training.train_corpus]

* Update onto-joint config

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-04 16:09:37 +03:00
 								def minibatch(items, size):
 								    """Iterate over batches of items. `size` may be an iterator,
 								    so that batch-size can vary on each step.
 								    """
 								    if isinstance(size, int):
 								        size_ = itertools.repeat(size)
 								    else:
 								        size_ = size
 								    items = iter(items)
 								    while True:
 								        batch_size = next(size_)
 								        batch = list(itertools.islice(items, int(batch_size)))
 								        if len(batch) == 0:
 								            break
 								        yield list(batch)
-												Handle None default args in Cython methods

											
										
										
											2020-09-29 19:08:02 +03:00
 								def is_cython_func(func: Callable) -> bool:
 								    """Slightly hacky check for whether a callable is implemented in Cython.
 								    Can be used to implement slightly different behaviors, especially around
-												Use __pyx_vtable__ instead of __reduce_cython__

											
										
										
											2020-09-29 23:04:17 +03:00
+								    inspecting and parameter annotations. Note that this will only return True
 								    for actual cdef functions and methods, not regular Python functions defined
 								    in Python modules.
-												Handle None default args in Cython methods

											
										
										
											2020-09-29 19:08:02 +03:00
 								    func (Callable): The callable to check.
 								    RETURNS (bool): Whether the callable is Cython (probably).
 								    """
-												Use __pyx_vtable__ instead of __reduce_cython__

											
										
										
											2020-09-29 23:04:17 +03:00
+								    attr = "__pyx_vtable__"
-												Handle None default args in Cython methods

											
										
										
											2020-09-29 19:08:02 +03:00
+								    if hasattr(func, attr):  # function or class instance
 								        return True
 								    # https://stackoverflow.com/a/55767059
-												Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if
`NORM` is used as an attribute, so it may warn in cases where it's not
relevant.

											
										
										
											2021-03-19 12:45:16 +03:00
+								    if (
 								        hasattr(func, "__qualname__")
 								        and hasattr(func, "__module__")
 								        and func.__module__ in sys.modules
 								    ):  # method
 								        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
 								        return hasattr(cls_func, attr)
-												Handle None default args in Cython methods

											
										
										
											2020-09-29 19:08:02 +03:00
+								    return False
-												Enable commit check via env var

											
										
										
											2020-10-05 21:51:15 +03:00
 								def check_bool_env_var(env_var: str) -> bool:
 								    """Convert the value of an environment variable to a boolean. Add special
 								    check for "0" (falsy) and consider everything else truthy, except unset.
 								    env_var (str): The name of the environment variable to check.
 								    RETURNS (bool): Its boolean value.
 								    """
 								    value = os.environ.get(env_var, False)
 								    if value == "0":
 								        return False
 								    return bool(value)
-												TrainablePipe (#6213)

* rename Pipe to TrainablePipe

* split functionality between Pipe and TrainablePipe

* remove unnecessary methods from certain components

* cleanup

* hasattr(component, "pipe") should be sufficient again

* remove serialization and vocab/cfg from Pipe

* unify _ensure_examples and validate_examples

* small fixes

* hasattr checks for self.cfg and self.vocab

* make is_resizable and is_trainable properties

* serialize strings.json instead of vocab

* fix KB IO + tests

* fix typos

* more typos

* _added_strings as a set

* few more tests specifically for _added_strings field

* bump to 3.0.0a36
											
										
										
											2020-10-08 22:33:49 +03:00
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								def _pipe(
 								    docs: Iterable["Doc"],
 								    proc: "Pipe",
 								    name: str,
 								    default_error_handler: Callable[[str, "Pipe", List["Doc"], Exception], NoReturn],
 								    kwargs: Mapping[str, Any],
 								) -> Iterator["Doc"]:
-												TrainablePipe (#6213)

* rename Pipe to TrainablePipe

* split functionality between Pipe and TrainablePipe

* remove unnecessary methods from certain components

* cleanup

* hasattr(component, "pipe") should be sufficient again

* remove serialization and vocab/cfg from Pipe

* unify _ensure_examples and validate_examples

* small fixes

* hasattr checks for self.cfg and self.vocab

* make is_resizable and is_trainable properties

* serialize strings.json instead of vocab

* fix KB IO + tests

* fix typos

* more typos

* _added_strings as a set

* few more tests specifically for _added_strings field

* bump to 3.0.0a36
											
										
										
											2020-10-08 22:33:49 +03:00
+								    if hasattr(proc, "pipe"):
 								        yield from proc.pipe(docs, **kwargs)
-												if-else

											
										
										
											2020-10-13 10:27:19 +03:00
+								    else:
 								        # We added some args for pipe that __call__ doesn't expect.
 								        kwargs = dict(kwargs)
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								        error_handler = default_error_handler
 								        if hasattr(proc, "get_error_handler"):
 								            error_handler = proc.get_error_handler()
-												if-else

											
										
										
											2020-10-13 10:27:19 +03:00
+								        for arg in ["batch_size"]:
 								            if arg in kwargs:
 								                kwargs.pop(arg)
 								        for doc in docs:
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								            try:
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
means. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that by branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confustion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								                doc = proc(doc, **kwargs)  # type: ignore[call-arg]
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								                yield doc
 								            except Exception as e:
 								                error_handler(name, proc, [doc], e)
 								def raise_error(proc_name, proc, docs, e):
 								    raise e
-												Move replacement logic to Language.from_config

											
										
										
											2021-01-29 11:37:04 +03:00
-												Error handling in nlp.pipe (#6817)

* add error handler for pipe methods

* add unit tests

* remove pipe method that are the same as their base class

* have Language keep track of a default error handler

* cleanup

* formatting

* small refactor

* add documentation
											
										
										
											2021-01-29 03:51:21 +03:00
+								def ignore_error(proc_name, proc, docs, e):
 								    pass
-												Add warning about GPU selection in Jupyter notebooks (#7075)

* Initial warning

* Update check

* Redo edit

* Move jupyter warning to helper method

* Add link with details to warnings
											
										
										
											2021-03-09 17:35:21 +03:00
 								def warn_if_jupyter_cupy():
 								    """Warn about require_gpu if a jupyter notebook + cupy + mismatched
 								    contextvars vs. thread ops are detected
 								    """
 								    if is_in_jupyter():
 								        from thinc.backends.cupy_ops import CupyOps
-												Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if
`NORM` is used as an attribute, so it may warn in cases where it's not
relevant.

											
										
										
											2021-03-19 12:45:16 +03:00
-												Add warning about GPU selection in Jupyter notebooks (#7075)

* Initial warning

* Update check

* Redo edit

* Move jupyter warning to helper method

* Add link with details to warnings
											
										
										
											2021-03-09 17:35:21 +03:00
+								        if CupyOps.xp is not None:
 								            from thinc.backends import contextvars_eq_thread_ops
-												Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if
`NORM` is used as an attribute, so it may warn in cases where it's not
relevant.

											
										
										
											2021-03-19 12:45:16 +03:00
-												Add warning about GPU selection in Jupyter notebooks (#7075)

* Initial warning

* Update check

* Redo edit

* Move jupyter warning to helper method

* Add link with details to warnings
											
										
										
											2021-03-09 17:35:21 +03:00
+								            if not contextvars_eq_thread_ops():
 								                warnings.warn(Warnings.W111)
-												Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if
`NORM` is used as an attribute, so it may warn in cases where it's not
relevant.

											
										
										
											2021-03-19 12:45:16 +03:00
 								def check_lexeme_norms(vocab, component_name):
 								    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
 								    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
 								        langs = ", ".join(LEXEME_NORM_LANGS)
 								        logger.debug(Warnings.W033.format(model=component_name, langs=langs))
-												Update sent_starts in Example.from_dict (#7847)

* Update sent_starts in Example.from_dict

Update `sent_starts` for `Example.from_dict` so that `Optional[bool]`
values have the same meaning as for `Token.is_sent_start`.

Use `Optional[bool]` as the type for sent start values in the docs.

* Use helper function for conversion to ternary ints
											
										
										
											2021-04-22 12:32:45 +03:00
 								def to_ternary_int(val) -> int:
 								    """Convert a value to the ternary 1/0/-1 int used for True/None/False in
 								    attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
 								    (None), any other values are -1 (False).
 								    """
-												Refactor util.to_ternary_int (#7944)

* Refactor to avoid literal comparison with `is`
* Extend tests
											
										
										
											2021-04-29 17:58:54 +03:00
+								    if val is True:
-												Update sent_starts in Example.from_dict (#7847)

* Update sent_starts in Example.from_dict

Update `sent_starts` for `Example.from_dict` so that `Optional[bool]`
values have the same meaning as for `Token.is_sent_start`.

Use `Optional[bool]` as the type for sent start values in the docs.

* Use helper function for conversion to ternary ints
											
										
										
											2021-04-22 12:32:45 +03:00
+								        return 1
-												Refactor util.to_ternary_int (#7944)

* Refactor to avoid literal comparison with `is`
* Extend tests
											
										
										
											2021-04-29 17:58:54 +03:00
+								    elif val is None:
 								        return 0
 								    elif val is False:
 								        return -1
 								    elif val == 1:
 								        return 1
 								    elif val == 0:
-												Update sent_starts in Example.from_dict (#7847)

* Update sent_starts in Example.from_dict

Update `sent_starts` for `Example.from_dict` so that `Optional[bool]`
values have the same meaning as for `Token.is_sent_start`.

Use `Optional[bool]` as the type for sent start values in the docs.

* Use helper function for conversion to ternary ints
											
										
										
											2021-04-22 12:32:45 +03:00
+								        return 0
 								    else:
 								        return -1
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
 								# The following implementation of packages_distributions() is adapted from
 								# importlib_metadata, which is distributed under the Apache 2.0 License.
 								# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw
 								# See licenses/3rd_party_licenses.txt
 								def packages_distributions() -> Dict[str, List[str]]:
 								    """Return a mapping of top-level packages to their distributions. We're
 								    inlining this helper from the importlib_metadata "backport" here, since
 								    it's not available in the builtin importlib.metadata.
 								    """
 								    pkg_to_dist = defaultdict(list)
-												Fix issues for Mypy 0.950 and Pydantic 1.9.0 (#10786)

* Make changes to typing

* Correction

* Format with black

* Corrections based on review

* Bumped Thinc dependency version

* Bumped blis requirement

* Correction for older Python versions

* Update spacy/ml/models/textcat.py

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

* Corrections based on review feedback

* Readd deleted docstring line

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
											
										
										
											2022-05-25 10:33:54 +03:00
+								    for dist in importlib_metadata.distributions():
-												Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
											
										
										
											2021-08-17 15:05:13 +03:00
+								        for pkg in (dist.read_text("top_level.txt") or "").split():
 								            pkg_to_dist[pkg].append(dist.metadata["Name"])
 								    return dict(pkg_to_dist)