spaCy/spacy/util.py

from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
from types import ModuleType
import os
import importlib
import importlib.util
import re
from pathlib import Path
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
import functools
import itertools
import numpy.random
import numpy
import srsly
import catalogue
import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
import subprocess
from contextlib import contextmanager
import tempfile
import shutil
import shlex
import inspect
import logging

try:
    import cupy.random
except ImportError:
    cupy = None

try:  # Python 3.8
    import importlib.metadata as importlib_metadata
except ImportError:
    import importlib_metadata

# These are functions that were previously (v2.x) available from spacy.util
# and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on,
# not from spacy.util.
from thinc.api import fix_random_seed, compounding, decaying  # noqa: F401


from .symbols import ORTH
from .compat import cupy, CudaStream, is_windows
from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
from . import about

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
    from .language import Language  # noqa: F401
    from .tokens import Doc, Span  # noqa: F401
    from .vocab import Vocab  # noqa: F401


# The largest value an unsigned 64-bit integer can hold. Presumably used as
# the rank of out-of-vocabulary entries (inferred from the name -- confirm
# against the callers).
OOV_RANK = numpy.iinfo(numpy.uint64).max
# Language codes. Presumably the languages for which lexeme norm data is
# available (inferred from the name -- confirm against the callers).
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

# Default order of sections in the config.cfg. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
# fmt: off
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
# fmt: on


# Set up the standard library's default logging configuration and expose the
# package-wide "spacy" logger.
logging.basicConfig()
logger = logging.getLogger("spacy")


class registry(thinc.registry):
    """Function registries for spaCy, extending the registries inherited from
    thinc.registry. Every attribute is a catalogue registry created under the
    "spacy" namespace. Registries created with entry_points=True are
    presumably also populated from installed packages' entry points (see the
    catalogue documentation to confirm the exact lookup rules).
    """

    languages = catalogue.create("spacy", "languages", entry_points=True)
    architectures = catalogue.create("spacy", "architectures", entry_points=True)
    tokenizers = catalogue.create("spacy", "tokenizers", entry_points=True)
    lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
    lookups = catalogue.create("spacy", "lookups", entry_points=True)
    displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
    misc = catalogue.create("spacy", "misc", entry_points=True)
    # Callback functions used to manipulate nlp object etc.
    callbacks = catalogue.create("spacy", "callbacks")
    batchers = catalogue.create("spacy", "batchers", entry_points=True)
    readers = catalogue.create("spacy", "readers", entry_points=True)
    loggers = catalogue.create("spacy", "loggers", entry_points=True)
    # These are factories registered via third-party packages and the
    # spacy_factories entry point. This registry only exists so we can easily
    # load them via the entry points. The "true" factories are added via the
    # Language.factory decorator (in the spaCy code base and user code) and those
    # are the factories used to initialize components via registry.resolve.
    _entry_point_factories = catalogue.create("spacy", "factories", entry_points=True)
    factories = catalogue.create("spacy", "internal_factories")
    # This is mostly used to get a list of all installed models in the current
    # environment. spaCy models packaged with `spacy package` will "advertise"
    # themselves via entry points.
    models = catalogue.create("spacy", "models", entry_points=True)
    cli = catalogue.create("spacy", "cli", entry_points=True)


class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Will raise an error if user or spaCy attempts to add to dict.

    Only item assignment, pop() and update() are blocked. The blocked methods
    accept any arguments, mirroring SimpleFrozenList, so that every call form
    supported by the built-in dict (e.g. update(a=1)) raises the custom error
    instead of an unrelated TypeError about the signature.
    """

    def __init__(self, *args, error: str = Errors.E095, **kwargs) -> None:
        """Initialize the frozen dict. Can be initialized with pre-defined
        values.

        error (str): The error message when user tries to assign to dict.
        """
        super().__init__(*args, **kwargs)
        self.error = error

    def __setitem__(self, key, value):
        raise NotImplementedError(self.error)

    def pop(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def update(self, *args, **kwargs):
        raise NotImplementedError(self.error)


class SimpleFrozenList(list):
    """List subclass whose mutating methods raise a custom error instead of
    modifying the list. Mostly used for properties like Language.pipeline that
    hand out a list which should not be changed (and that we don't want to
    turn into a tuple, to not break too much backwards compatibility). If a
    user accidentally calls nlp.pipeline.append(), they get a helpful message.
    """

    def __init__(self, *args, error: str = Errors.E927) -> None:
        """Initialize the frozen list.

        error (str): The error message when user tries to mutate the list.
        """
        self.error = error
        super().__init__(*args)

    def _refuse(self):
        # Single place that produces the error for every blocked method.
        raise NotImplementedError(self.error)

    def append(self, *args, **kwargs):
        self._refuse()

    def clear(self, *args, **kwargs):
        self._refuse()

    def extend(self, *args, **kwargs):
        self._refuse()

    def insert(self, *args, **kwargs):
        self._refuse()

    def pop(self, *args, **kwargs):
        self._refuse()

    def remove(self, *args, **kwargs):
        self._refuse()

    def reverse(self, *args, **kwargs):
        self._refuse()

    def sort(self, *args, **kwargs):
        self._refuse()


def lang_class_is_loaded(lang: str) -> bool:
    """Tell whether a Language class is already present in the languages
    registry. Language classes are loaded lazily, to avoid the expensive setup
    code associated with the language data.

    lang (str): Two-letter language code, e.g. 'en'.
    RETURNS (bool): Whether a Language class has been loaded.
    """
    is_registered = lang in registry.languages
    return is_registered


def get_lang_class(lang: str) -> "Language":
    """Import and load a Language class.

    lang (str): Two-letter language code, e.g. 'en'.
    RETURNS (Language): Language class.
    """
    if lang not in registry.languages:
        # Not registered (and no entry point available): import the language
        # module from spacy.lang and register the first name it exports.
        try:
            lang_module = importlib.import_module(f".lang.{lang}", "spacy")
        except ImportError as err:
            raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
        lang_cls = getattr(lang_module, lang_module.__all__[0])
        set_lang_class(lang, lang_cls)
    return registry.languages.get(lang)


def set_lang_class(name: str, cls: Type["Language"]) -> None:
    """Register a custom Language class under a name, so that it can be
    loaded via get_lang_class.

    name (str): Name of Language class.
    cls (Language): Language class.
    """
    languages = registry.languages
    languages.register(name, func=cls)


def ensure_path(path: Any) -> Any:
    """Convert a string to a Path and pass everything else through untouched.

    path (Any): Anything. If string, it's converted to Path.
    RETURNS: Path or original argument.
    """
    return Path(path) if isinstance(path, str) else path


def load_language_data(path: Union[str, Path]) -> Union[dict, list]:
    """Load JSON language data from the given path. If nothing exists at that
    path, the same path with ".gz" appended is tried as gzipped JSON before
    giving up.

    path (str / Path): The data to load.
    RETURNS: The loaded data.
    """
    json_path = ensure_path(path)
    if not json_path.exists():
        # Fall back to the gzipped variant, e.g. data.json -> data.json.gz
        gzip_path = json_path.with_suffix(json_path.suffix + ".gz")
        if not gzip_path.exists():
            raise ValueError(Errors.E160.format(path=gzip_path))
        return srsly.read_gzip_json(gzip_path)
    return srsly.read_json(json_path)


def get_module_path(module: ModuleType) -> Path:
    """Get the directory containing the file of the module that the given
    object names in its `__module__` attribute.

    module (ModuleType): The object to look up. It needs a `__module__`
        attribute that is a key of sys.modules.
    RETURNS (Path): The path.
    """
    if hasattr(module, "__module__"):
        owning_module = sys.modules[module.__module__]
        return Path(owning_module.__file__).parent
    raise ValueError(Errors.E169.format(module=repr(module)))


def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings=True
) -> None:
    """Load word vectors from an installed model or path into a model instance.

    nlp (Language): The nlp object whose vocab receives the vectors.
    name (str / Path): Package name or path of the model providing the vectors.
    add_strings (bool): Whether to also copy the strings for the vector keys.
    """
    source_nlp = load_model(name)
    nlp.vocab.vectors = source_nlp.vocab.vectors
    if not add_strings:
        return
    # Copy over the strings behind the vector keys, so that they can be
    # resolved on the target vocab as well (e.g. for similarity queries).
    source_strings = source_nlp.vocab.strings
    for key in nlp.vocab.vectors.key2row:
        if key in source_strings:
            nlp.vocab.strings.add(source_strings[key])


def load_vocab_data_into_model(
    nlp: "Language", *, lookups: Optional["Lookups"] = None
) -> None:
    """Load vocab data into the nlp object's vocab.

    nlp (Language): The nlp object to update.
    lookups (Optional[Lookups]): Lookups to set on the vocab. Falsy values
        leave the existing lookups in place.
    """
    if not lookups:
        return
    nlp.vocab.lookups = lookups


def load_model(
    name: Union[str, Path],
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Load a model from a package or data path.

    name (str / Path): Package name or model path. A string of the form
        "blank:xx" creates a blank instance of the language class for "xx".
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable.
    exclude (Iterable[str]): Names of pipeline components to exclude.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    load_kwargs = dict(vocab=vocab, disable=disable, exclude=exclude, config=config)
    if isinstance(name, str):
        # Strings are tried as blank shortcut, package name and path, in
        # that order.
        if name.startswith("blank:"):
            lang_cls = get_lang_class(name.replace("blank:", ""))
            return lang_cls()
        if is_package(name):
            return load_model_from_package(name, **load_kwargs)
        data_path = Path(name)
        if data_path.exists():
            return load_model_from_path(data_path, **load_kwargs)
    elif hasattr(name, "exists"):
        # Path or Path-like pointing to the model data
        return load_model_from_path(name, **load_kwargs)
    # Nothing matched: give a dedicated hint for the old v2 shortcuts.
    if name in OLD_MODEL_SHORTCUTS:
        full_name = OLD_MODEL_SHORTCUTS[name]
        raise IOError(Errors.E941.format(name=name, full=full_name))
    raise IOError(Errors.E050.format(name=name))


def load_model_from_package(
    name: str,
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Load a model from an installed package by importing the package and
    calling its load() function.

    name (str): The package name.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    model_package = importlib.import_module(name)
    load_kwargs = dict(vocab=vocab, disable=disable, exclude=exclude, config=config)
    return model_package.load(**load_kwargs)


def load_model_from_path(
    model_path: Union[str, Path],
    *,
    meta: Optional[Dict[str, Any]] = None,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Load a model from a data directory path. Creates Language class with
    pipeline from config.cfg and then calls from_disk() with path.

    model_path (str / Path): Path to the model data directory.
    meta (Dict[str, Any]): Optional model meta. If not provided, the
        meta.json in the model directory is loaded.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    # The signature allows plain strings, so convert before using Path methods.
    model_path = ensure_path(model_path)
    if not model_path.exists():
        raise IOError(Errors.E052.format(path=model_path))
    if not meta:
        # The meta isn't used any further here, but it is still loaded when
        # the caller didn't provide one.
        meta = get_model_meta(model_path)
    config_path = model_path / "config.cfg"
    config = load_config(config_path, overrides=dict_to_dot(config))
    nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude)
    return nlp.from_disk(model_path, exclude=exclude)


def load_model_from_config(
    config: Union[Dict[str, Any], Config],
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    auto_fill: bool = False,
    validate: bool = True,
) -> "Language":
    """Create an nlp object from a config. Expects the full config file including
    a section "nlp" containing the settings for the nlp object.

    config (Dict[str, Any] / Config): The full config. Its "nlp" section needs
        to define a "lang" that is not None.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    auto_fill (bool): Whether to auto-fill config with missing defaults.
    validate (bool): Whether to show config validation errors.
    RETURNS (Language): The loaded nlp object.
    """
    if "nlp" not in config:
        raise ValueError(Errors.E985.format(config=config))
    nlp_config = config["nlp"]
    lang = nlp_config["lang"] if "lang" in nlp_config else None
    if lang is None:
        raise ValueError(Errors.E993.format(config=nlp_config))
    # get_lang_class handles all codes registered via the languages registry,
    # including custom subclasses provided via entry points.
    lang_cls = get_lang_class(lang)
    return lang_cls.from_config(
        config,
        vocab=vocab,
        disable=disable,
        exclude=exclude,
        auto_fill=auto_fill,
        validate=validate,
    )


def resolve_training_config(
    config: Config,
    exclude: Iterable[str] = ("nlp", "components"),
    validate: bool = True,
) -> Dict[str, Any]:
    """Resolve the config sections relevant for training and create all
    objects. Mostly used in the CLI to separate the training config, which is
    not resolved by default because it's not runtime-relevant: an nlp object
    should load fine even if its [training] block refers to functions that
    are not available etc.

    config (Config): The config to resolve. It is copied, not modified.
    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
        be available in the final resolved config.
    validate (bool): Whether to validate the config.
    RETURNS (Dict[str, Any]): The resolved config.
    """
    trimmed = config.copy()
    for section in exclude:
        if section not in trimmed:
            continue
        trimmed.pop(section)
    return registry.resolve(trimmed, validate=validate)


def resolve_dot_names(
    config: Config, dot_names: List[Optional[str]]
) -> List[Optional[Callable]]:
    """Resolve one or more "dot notation" names, e.g. corpora.train.
    The paths could point anywhere into the config, so we don't know which
    top-level section we'll be looking within.

    We resolve the whole top-level section, although we could resolve less --
    we could find the lowest part of the tree.

    config (Config): The config to look the names up in.
    dot_names (List[Optional[str]]): The names to resolve. None entries are
        passed through as None.
    RETURNS (List[Optional[Callable]]): The resolved objects, in input order.
    """
    sections = {}
    objects = []
    for dot_name in dot_names:
        if dot_name is None:
            objects.append(dot_name)
            continue
        top_level = dot_name.partition(".")[0]
        # Each top-level section is resolved at most once.
        if top_level not in sections:
            sections[top_level] = registry.resolve(config[top_level])
        objects.append(dot_to_object(sections, dot_name))
    return objects


def load_model_from_init_py(
    init_file: Union[Path, str],
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Helper function to use in the `load()` method of a model package's
    __init__.py.

    init_file (Path / str): Path to the package's __init__.py. The meta.json
        and the data directory are expected next to it.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    model_path = Path(init_file).parent
    meta = get_model_meta(model_path)
    # The data lives in a subdirectory named {lang}_{name}-{version}.
    data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
    data_path = model_path / data_dir
    # Check the directory the error message reports: the data directory, not
    # the package directory that the meta was just read from.
    if not data_path.exists():
        raise IOError(Errors.E052.format(path=data_path))
    return load_model_from_path(
        data_path,
        vocab=vocab,
        meta=meta,
        disable=disable,
        exclude=exclude,
        config=config,
    )


def load_config(
    path: Union[str, Path],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    interpolate: bool = False,
) -> Config:
    """Load a config file. Takes care of path validation and section order.

    path (Union[str, Path]): Path to the config file.
    overrides: (Dict[str, Any]): Config overrides as nested dict or
        dict keyed by section values in dot notation.
    interpolate (bool): Whether to interpolate and resolve variables.
    RETURNS (Config): The loaded config.
    """
    config_path = ensure_path(path)
    is_existing_file = config_path.exists() and config_path.is_file()
    if not is_existing_file:
        raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
    empty_config = Config(section_order=CONFIG_SECTION_ORDER)
    return empty_config.from_disk(
        config_path, overrides=overrides, interpolate=interpolate
    )


def load_config_from_str(
    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
):
    """Load a full config from a string. Wrapper around Thinc's Config.from_str
    that applies the default section order.

    text (str): The string config to load.
    overrides (Dict[str, Any]): Config overrides, passed on to Config.from_str.
    interpolate (bool): Whether to interpolate and resolve variables.
    RETURNS (Config): The loaded config.
    """
    empty_config = Config(section_order=CONFIG_SECTION_ORDER)
    return empty_config.from_str(text, overrides=overrides, interpolate=interpolate)


def get_installed_models() -> List[str]:
    """List all model packages currently installed in the environment, i.e.
    all names available in the models registry.

    RETURNS (List[str]): The string names of the models.
    """
    installed = registry.models.get_all()
    return [model_name for model_name in installed.keys()]


def get_package_version(name: str) -> Optional[str]:
    """Look up the version of an installed package. Typically used to get
    model package versions.

    name (str): The name of the installed Python package.
    RETURNS (str / None): The version or None if package not installed.
    """
    try:
        installed_version = importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        installed_version = None
    return installed_version


def is_compatible_version(
    version: str, constraint: str, prereleases: bool = True
) -> Optional[bool]:
    """Check if a version (e.g. "2.0.0") is compatible given a version
    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
    it's interpreted as =={version}.

    version (str): The version to check.
    constraint (str): The constraint string. An empty string is handed to
        SpecifierSet unchanged instead of raising an IndexError.
    prereleases (bool): Whether to allow prereleases. If set to False,
        prerelease versions will be considered incompatible.
    RETURNS (bool / None): Whether the version is compatible, or None if the
        version or constraint are invalid.
    """
    # Handle cases where exact version is provided as constraint. The
    # truthiness check guards the index access against an empty string.
    if constraint and constraint[0].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        parsed_version = Version(version)
    except (InvalidSpecifier, InvalidVersion):
        return None
    spec.prereleases = prereleases
    return parsed_version in spec


def is_unconstrained_version(
    constraint: str, prereleases: bool = True
) -> Optional[bool]:
    """Check whether a version constraint leaves the version unbounded, i.e.
    it is neither an exact version nor a range with both a lower and an upper
    bound.

    constraint (str): The constraint string, e.g. ">=1.9.0,<2.2.1". An empty
        string no longer raises an IndexError and falls through the checks
        below.
    prereleases (bool): Set as the prereleases flag of the specifier set.
    RETURNS (bool / None): Whether the constraint is unconstrained, or None if
        the constraint is invalid.
    """
    # We have an exact version, this is the ultimate constrained version. The
    # truthiness check guards the index access against an empty string.
    if constraint and constraint[0].isdigit():
        return False
    try:
        spec = SpecifierSet(constraint)
    except InvalidSpecifier:
        return None
    spec.prereleases = prereleases
    specs = list(spec)
    # We only have one version spec and it defines > or >=
    if len(specs) == 1 and specs[0].operator in (">", ">="):
        return True
    # One specifier is exact version
    if any(sp.operator == "==" for sp in specs):
        return False
    has_upper = any(sp.operator in ("<", "<=") for sp in specs)
    has_lower = any(sp.operator in (">", ">=") for sp in specs)
    # We have a version spec that defines an upper and lower bound
    if has_upper and has_lower:
        return False
    # Everything else, like only an upper version, only a lower version etc.
    return True


def get_model_version_range(spacy_version: str) -> str:
    """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
    version. Models are always compatible across patch versions but not
    across minor or major versions.

    spacy_version (str): The spaCy version that forms the lower bound.
    RETURNS (str): The version range.
    """
    release = Version(spacy_version).release
    major = release[0]
    next_minor = release[1] + 1
    return f">={spacy_version},<{major}.{next_minor}.0"


def get_base_version(version: str) -> str:
    """Strip any prerelease identifiers from a version.

    version (str): The version, e.g. "3.0.0.dev1".
    RETURNS (str): The base version, e.g. "3.0.0".
    """
    parsed = Version(version)
    return parsed.base_version


def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
    """Load a model meta.json from a path and validate its contents. Warns if
    the meta's spaCy version constraint doesn't match the current spaCy
    version, or if that constraint is unconstrained.

    path (Union[str, Path]): Path to meta.json.
    RETURNS (Dict[str, Any]): The loaded meta.
    """
    meta_path = ensure_path(path)
    if not meta_path.parent.exists():
        raise IOError(Errors.E052.format(path=meta_path.parent))
    if not (meta_path.exists() and meta_path.is_file()):
        raise IOError(Errors.E053.format(path=meta_path, name="meta.json"))
    meta = srsly.read_json(meta_path)
    # These settings are required and need to have a non-empty value.
    for setting in ("lang", "name", "version"):
        if setting not in meta or not meta[setting]:
            raise ValueError(Errors.E054.format(setting=setting))
    if "spacy_version" not in meta:
        return meta
    spacy_constraint = meta["spacy_version"]
    model_name = f"{meta['lang']}_{meta['name']}"
    if not is_compatible_version(about.__version__, spacy_constraint):
        incompatible_msg = Warnings.W095.format(
            model=model_name,
            model_version=meta["version"],
            version=spacy_constraint,
            current=about.__version__,
        )
        warnings.warn(incompatible_msg)
    if is_unconstrained_version(spacy_constraint):
        unconstrained_msg = Warnings.W094.format(
            model=model_name,
            model_version=meta["version"],
            version=spacy_constraint,
            example=get_model_version_range(about.__version__),
        )
        warnings.warn(unconstrained_msg)
    return meta


def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
    """Load and validate the meta.json located in a model directory.

    path (str / Path): Path to model directory.
    RETURNS (Dict[str, Any]): The model's meta data.
    """
    meta_path = ensure_path(path) / "meta.json"
    return load_meta(meta_path)


def is_package(name: str) -> bool:
    """Check if string maps to a package installed via pip.

    name (str): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    try:
        importlib_metadata.distribution(name)
    except Exception:
        # Any failure of the lookup counts as "not a package". Unlike a bare
        # except, this lets KeyboardInterrupt and SystemExit propagate.
        return False
    return True


def get_package_path(name: str) -> Path:
    """Get the path to an installed package.

    name (str): Package name.
    RETURNS (Path): Path to installed package.
    """
    # The package is imported (under its lowercased name, to be safe) just to
    # locate it. That's worryingly indirect, but it's otherwise very
    # difficult to find the package.
    package = importlib.import_module(name.lower())
    package_file = Path(package.__file__)
    return package_file.parent


def split_command(command: str) -> List[str]:
    """Split a string command using shlex. Handles platform compatibility by
    turning off POSIX mode on Windows.

    command (str) : The command to split
    RETURNS (List[str]): The split command.
    """
    use_posix = not is_windows
    return shlex.split(command, posix=use_posix)


def join_command(command: List[str]) -> str:
    """Join a command into a single string, quoting each part with shlex.
    shlex.join is only available for Python 3.8+, so this is a workaround.

    command (List[str]): The command to join.
    RETURNS (str): The joined command
    """
    quoted_parts = map(shlex.quote, command)
    return " ".join(quoted_parts)


def run_command(
    command: Union[str, List[str]],
    *,
    stdin: Optional[Any] = None,
    capture: bool = False,
) -> Optional[subprocess.CompletedProcess]:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, either an error is raised (capture=True) or
    a system exit is performed (capture=False).

    command (str / List[str]): The command. If provided as a string, the
        string will be split using shlex.split.
    stdin (Optional[Any]): Input to pass to the subprocess, or None.
    capture (bool): Whether to capture the output and errors. If False,
        the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the returncode. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    RETURNS (Optional[CompletedProcess]): The process object.
    """
    given_as_string = isinstance(command, str)
    cmd_list = split_command(command) if given_as_string else command
    cmd_str = command if given_as_string else " ".join(command)
    # With capture, stderr is merged into the captured stdout.
    stdout_target = subprocess.PIPE if capture else None
    stderr_target = subprocess.STDOUT if capture else None
    try:
        ret = subprocess.run(
            cmd_list,
            env=os.environ.copy(),
            input=stdin,
            encoding="utf8",
            check=False,
            stdout=stdout_target,
            stderr=stderr_target,
        )
    except FileNotFoundError:
        # Indicates the *command* wasn't found, it's an error before the command
        # is run.
        not_found_msg = Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
        raise FileNotFoundError(not_found_msg) from None
    if ret.returncode == 0:
        return ret
    if capture:
        parts = [
            f"Error running command:\n\n{cmd_str}\n\n",
            f"Subprocess exited with status {ret.returncode}",
        ]
        if ret.stdout is not None:
            parts.append("\n\nProcess log (stdout and stderr):\n\n")
            parts.append(ret.stdout)
        error = subprocess.SubprocessError("".join(parts))
        # Attach the details so that callers can inspect the failure.
        error.ret = ret
        error.command = cmd_str
        raise error
    sys.exit(ret.returncode)
    return ret


@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[Path]:
    """Change current working directory and returns to previous on exit.

    path (str / Path): The directory to navigate to.
    YIELDS (Path): The absolute path to the current working directory. This
        should be used if the block needs to perform actions within the working
        directory, to prevent mismatches with relative paths.
    """
    prev_cwd = Path.cwd()
    current = Path(path).resolve()
    os.chdir(str(current))
    try:
        yield current
    finally:
        # Always go back, even if the block raised.
        os.chdir(str(prev_cwd))


@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
    """Execute a block in a temporary directory and remove the directory and
    its contents at the end of the with block. The directory is also removed
    if the block raises an exception.

    YIELDS (Path): The path of the temp directory.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Clean up in a finally clause, otherwise an exception in the with
        # block would leak the directory.
        try:
            shutil.rmtree(str(d))
        except PermissionError as e:
            # Removal is best-effort: warn instead of failing.
            warnings.warn(Warnings.W091.format(dir=d, msg=e))


def is_cwd(path: Union[Path, str]) -> bool:
    """Check whether a path is the current working directory. Both paths are
    resolved and compared case-insensitively.

    path (Union[Path, str]): The directory path.
    RETURNS (bool): Whether the path is the current working directory.
    """
    candidate = str(Path(path).resolve())
    cwd = str(Path.cwd().resolve())
    return candidate.lower() == cwd.lower()


def is_in_jupyter() -> bool:
    """Check if user is running spaCy from a Jupyter notebook by detecting the
    IPython kernel. Mainly used for the displaCy visualizer.
    RETURNS (bool): True if in Jupyter, False if not.
    """
    # https://stackoverflow.com/a/39662359/6400719
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython isn't defined: probably standard Python interpreter
        return False
    # ZMQInteractiveShell means Jupyter notebook or qtconsole
    return shell_name == "ZMQInteractiveShell"


def get_object_name(obj: Any) -> str:
    """Get a human-readable name of a Python object, e.g. a pipeline component.
    Uses the object's `name`, then its `__name__`, then the name of its class
    and finally its repr.

    obj (Any): The Python object, typically a function or class.
    RETURNS (str): A human-readable name.
    """
    for attr in ("name", "__name__"):
        if hasattr(obj, attr):
            return getattr(obj, attr)
    has_class_name = hasattr(obj, "__class__") and hasattr(obj.__class__, "__name__")
    return obj.__class__.__name__ if has_class_name else repr(obj)


def is_same_func(func1: Callable, func2: Callable) -> bool:
    """Approximately decide whether two functions are the same, even if their
    identity is different (e.g. after they have been live reloaded). Mostly
    used in the @Language.component and @Language.factory decorators to decide
    whether to raise if a factory already exists. Allows decorator to run
    multiple times with the same function.

    func1 (Callable): The first function.
    func2 (Callable): The second function.
    RETURNS (bool): Whether it's the same function (most likely).
    """
    if not (callable(func1) and callable(func2)):
        return False
    # All three checks are evaluated up front (no short-circuiting), so an
    # error raised by inspect always surfaces.
    checks = [
        func1.__qualname__ == func2.__qualname__,
        inspect.getfile(func1) == inspect.getfile(func2),
        inspect.getsourcelines(func1) == inspect.getsourcelines(func2),
    ]
    return all(checks)


def get_cuda_stream(
    require: bool = False, non_blocking: bool = True
) -> Optional[CudaStream]:
    """Create a CUDA stream, unless CudaStream is not available or the
    current ops are NumpyOps.

    require (bool): Currently not used.
    non_blocking (bool): Passed on to CudaStream.
    RETURNS (Optional[CudaStream]): The stream, or None.
    """
    ops = get_current_ops()
    if CudaStream is None or isinstance(ops, NumpyOps):
        return None
    return CudaStream(non_blocking=non_blocking)


def get_async(stream, numpy_array):
    """Copy a numpy array into a new cupy array using the given stream.

    stream: The stream passed on to the cupy array's set() method.
    numpy_array: The array to copy.
    RETURNS: The input array itself if cupy is not available, otherwise a
        C-ordered cupy array of the same shape and dtype holding the data.
    """
    if cupy is not None:
        device_array = cupy.ndarray(
            numpy_array.shape, order="C", dtype=numpy_array.dtype
        )
        device_array.set(numpy_array, stream=stream)
        return device_array
    return numpy_array


def read_regex(path: Union[str, Path]) -> Pattern:
    """Build a regex from a file containing one literal entry per line.

    path (Union[str, Path]): Location of the UTF-8 encoded file.
    RETURNS (Pattern): A compiled alternation in which every non-blank line
        is escaped and anchored to the start of the string.
    """
    path = ensure_path(path)
    with path.open(encoding="utf8") as file_:
        content = file_.read()
    alternatives = []
    for line in content.split("\n"):
        # Blank and whitespace-only lines don't contribute an alternative
        if line.strip():
            alternatives.append("^" + re.escape(line))
    return re.compile("|".join(alternatives))


def compile_prefix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
    """Join prefix rules into one compiled regex in which every rule is
    anchored to the start of the string.

    entries (Iterable[Union[str, Pattern]]): The prefix rules, e.g.
        spacy.lang.punctuation.TOKENIZER_PREFIXES. Each entry is stripped and
        concatenated as a string; blank entries are skipped.
    RETURNS (Pattern): The regex object, to be used for
        Tokenizer.prefix_search.
    """
    anchored = []
    for piece in entries:
        if piece.strip():
            anchored.append("^" + piece)
    return re.compile("|".join(anchored))


def compile_suffix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
    """Join suffix rules into one compiled regex in which every rule is
    anchored to the end of the string.

    entries (Iterable[Union[str, Pattern]]): The suffix rules, e.g.
        spacy.lang.punctuation.TOKENIZER_SUFFIXES. Each entry is stripped and
        concatenated as a string; blank entries are skipped.
    RETURNS (Pattern): The regex object, to be used for
        Tokenizer.suffix_search.
    """
    anchored = []
    for piece in entries:
        if piece.strip():
            anchored.append(piece + "$")
    return re.compile("|".join(anchored))


def compile_infix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
    """Join infix rules into one compiled regex, without adding any anchors.

    entries (Iterable[Union[str, Pattern]]): The infix rules, e.g.
        spacy.lang.punctuation.TOKENIZER_INFIXES. Each entry is stripped and
        joined as a string; blank entries are skipped.
    RETURNS (Pattern): The regex object, to be used for
        Tokenizer.infix_finditer.
    """
    alternatives = []
    for piece in entries:
        if piece.strip():
            alternatives.append(piece)
    return re.compile("|".join(alternatives))


def add_lookups(default_func: Callable[[str], Any], *lookups) -> Callable[[str], Any]:
    """Wrap an attribute function so that special cases are consulted first.
    If a word is found in one of the lookups, that value is returned,
    otherwise the wrapped function is called.

    default_func (callable): The function to fall back to.
    *lookups (dict): Lookup dictionaries mapping string to attribute value.
    RETURNS (callable): Lexical attribute getter.
    """
    # A functools.partial is used rather than a closure so that the returned
    # getter can be pickled.
    getter = functools.partial(_get_attr_unless_lookup, default_func, lookups)
    return getter


def _get_attr_unless_lookup(
    default_func: Callable[[str], Any], lookups: Dict[str, Any], string: str
) -> Any:
    for lookup in lookups:
        if string in lookup:
            return lookup[string]
    return default_func(string)


def update_exc(
    base_exceptions: Dict[str, List[dict]], *addition_dicts
) -> Dict[str, List[dict]]:
    """Merge tokenizer exceptions and validate the added ones. Later entries
    overwrite earlier ones with the same key.

    base_exceptions (Dict[str, List[dict]]): Base exceptions.
    *addition_dicts (Dict[str, List[dict]]): Exceptions to add to the base
        dict, in order.
    RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions, expanded
        with variants of all keys that use a typographic apostrophe.
    """
    combined = dict(base_exceptions)
    for additions in addition_dicts:
        for orth, token_attrs in additions.items():
            # Every token description needs a string ORTH value
            has_non_str = any(not isinstance(attr[ORTH], str) for attr in token_attrs)
            if has_non_str:
                raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
            # The concatenated ORTH values must reproduce the key exactly
            joined_orth = "".join(attr[ORTH] for attr in token_attrs)
            if orth != joined_orth:
                raise ValueError(Errors.E056.format(key=orth, orths=joined_orth))
        combined.update(additions)
    return expand_exc(combined, "'", "’")


def expand_exc(
    excs: Dict[str, List[dict]], search: str, replace: str
) -> Dict[str, List[dict]]:
    """Duplicate every tokenizer exception whose key contains a given string,
    with that string replaced in the key and in the ORTH of each token. For
    example, to add additional versions with typographic apostrophes.

    excs (Dict[str, List[dict]]): Tokenizer exceptions.
    search (str): String to find and replace.
    replace (str): Replacement.
    RETURNS (Dict[str, List[dict]]): A new dict with the original exceptions
        plus the generated variants.
    """

    def _swap_orth(token):
        # Work on a copy so the original token description stays untouched
        swapped = dict(token)
        swapped[ORTH] = swapped[ORTH].replace(search, replace)
        return swapped

    expanded = dict(excs)
    for key, tokens in excs.items():
        if search not in key:
            continue
        variant_key = key.replace(search, replace)
        expanded[variant_key] = [_swap_orth(token) for token in tokens]
    return expanded


def normalize_slice(
    length: int,
    start: Optional[int],
    stop: Optional[int],
    step: Optional[int] = None,
) -> Tuple[int, int]:
    """Resolve slice bounds against a sequence length. Missing bounds default
    to the full range, negative bounds count from the end, and both bounds
    are clamped so that 0 <= start <= stop <= length.

    length (int): The length of the sequence being sliced.
    start (Optional[int]): The start index, or None for the beginning.
    stop (Optional[int]): The stop index, or None for the end.
    step (Optional[int]): The step. Only None and 1 are supported.
    RETURNS (Tuple[int, int]): The normalized (start, stop) pair.
    RAISES (ValueError): If a step other than None or 1 is given.
    """
    if not (step is None or step == 1):
        raise ValueError(Errors.E057)
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))
    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamping against start (not 0) guarantees a non-negative span
    stop = min(length, max(start, stop))
    return start, stop


def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
    """Filter a sequence of spans and remove duplicates or overlaps. Useful for
    creating named entities (where one token can only be part of one entity) or
    when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
    longest span is preferred over shorter spans.

    spans (Iterable[Span]): The spans to filter.
    RETURNS (List[Span]): The filtered spans, sorted by start position.
    """
    # Longest spans first; for equal lengths, the one starting earlier wins
    get_sort_key = lambda span: (span.end - span.start, -span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
    result = []
    seen_tokens = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive. Checking
        # only the two boundary tokens is enough, as all previously accepted
        # spans are at least as long as this one.
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            # Only tokens of accepted spans are blocked. Recording tokens of
            # rejected spans would also discard spans that merely overlap a
            # span which isn't part of the result.
            seen_tokens.update(range(span.start, span.end))
    result = sorted(result, key=lambda span: span.start)
    return result


def to_bytes(getters: Dict[str, Callable[[], bytes]], exclude: Iterable[str]) -> bytes:
    """Collect the values of all non-excluded getters and pack them with
    msgpack.

    getters (Dict[str, Callable[[], bytes]]): Callbacks keyed by name.
    exclude (Iterable[str]): Names to skip.
    RETURNS (bytes): The msgpack-serialized dict.
    """
    payload = to_dict(getters, exclude)
    return srsly.msgpack_dumps(payload)


def from_bytes(
    bytes_data: bytes,
    setters: Dict[str, Callable[[bytes], Any]],
    exclude: Iterable[str],
) -> Dict[str, Any]:
    """Unpack msgpack data and hand each entry to its matching setter.

    bytes_data (bytes): The msgpack-serialized data.
    setters (Dict[str, Callable[[bytes], Any]]): Callbacks keyed by name.
    exclude (Iterable[str]): Names to skip.
    RETURNS (Dict[str, Any]): The unpacked message, as returned by from_dict.
    """
    msg = srsly.msgpack_loads(bytes_data)
    return from_dict(msg, setters, exclude)


def to_dict(
    getters: Dict[str, Callable[[], Any]], exclude: Iterable[str]
) -> Dict[str, Any]:
    """Call every getter that isn't excluded and collect the results.

    getters (Dict[str, Callable[[], Any]]): Callbacks keyed by name.
    exclude (Iterable[str]): Names to skip. A key is matched by the part
        before its first dot, to support file names like meta.json.
    RETURNS (Dict[str, Any]): The getter results, keyed like the getters.
    """
    return {
        key: getter()
        for key, getter in getters.items()
        if key.split(".")[0] not in exclude
    }


def from_dict(
    msg: Dict[str, Any],
    setters: Dict[str, Callable[[Any], Any]],
    exclude: Iterable[str],
) -> Dict[str, Any]:
    """Pass the entries of a message to their matching setters.

    msg (Dict[str, Any]): The data to distribute.
    setters (Dict[str, Callable[[Any], Any]]): Callbacks keyed by name.
    exclude (Iterable[str]): Names to skip. A key is matched by the part
        before its first dot, to support file names like meta.json.
    RETURNS (Dict[str, Any]): The unmodified input message.
    """
    for key, setter in setters.items():
        if key.split(".")[0] in exclude:
            continue
        # Setters without a corresponding entry are left alone
        if key not in msg:
            continue
        setter(msg[key])
    return msg


def to_disk(
    path: Union[str, Path],
    writers: Dict[str, Callable[[Path], None]],
    exclude: Iterable[str],
) -> Path:
    """Run every writer that isn't excluded on its location in a directory.

    path (Union[str, Path]): The target directory, created if it's missing.
    writers (Dict[str, Callable[[Path], None]]): Callbacks keyed by the name
        of the file or directory they write, relative to the path.
    exclude (Iterable[str]): Names to skip. A key is matched by the part
        before its first dot, to support file names like meta.json.
    RETURNS (Path): The target directory.
    """
    output_dir = ensure_path(path)
    if not output_dir.exists():
        output_dir.mkdir()
    for key, writer in writers.items():
        if key.split(".")[0] in exclude:
            continue
        writer(output_dir / key)
    return output_dir


def from_disk(
    path: Union[str, Path],
    readers: Dict[str, Callable[[Path], None]],
    exclude: Iterable[str],
) -> Path:
    """Run every reader that isn't excluded on its location in a directory.

    path (Union[str, Path]): The directory to load from.
    readers (Dict[str, Callable[[Path], None]]): Callbacks keyed by the name
        of the file or directory they read, relative to the path.
    exclude (Iterable[str]): Names to skip. A key is matched by the part
        before its first dot, to support file names like meta.json.
    RETURNS (Path): The directory that was loaded from.
    """
    input_dir = ensure_path(path)
    for key, reader in readers.items():
        if key.split(".")[0] in exclude:
            continue
        reader(input_dir / key)
    return input_dir


def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
    """Load and execute a Python file as a module. Used to load models from
    a directory.

    name (str): Name to give the loaded module.
    loc (str / Path): Path to the file.
    RETURNS: The loaded module.
    """
    file_path = str(loc)
    module_spec = importlib.util.spec_from_file_location(name, file_path)
    loaded = importlib.util.module_from_spec(module_spec)
    # Run the module's code so that its top-level names become available
    module_spec.loader.exec_module(loaded)
    return loaded


def minify_html(html: str) -> str:
    """Apply a rudimentary, template-specific HTML minification for displaCy.
    Disclaimer: NOT a general-purpose solution. It only strips the markup,
    removes runs of four spaces (indentation) and removes newlines.

    html (str): Markup to minify.
    RETURNS (str): "Minified" HTML.
    """
    stripped = html.strip()
    without_indent = stripped.replace("    ", "")
    return without_indent.replace("\n", "")


def escape_html(text: str) -> str:
    """Replace &, <, > and " with their HTML encoded representation. Intended
    to prevent HTML errors in rendered displaCy markup.

    text (str): The original text.
    RETURNS (str): Equivalent text to be safely used within HTML.
    """
    # The ampersand has to come first, otherwise the ampersands introduced by
    # the other replacements would be escaped again.
    replacements = (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"), ('"', "&quot;"))
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text


def get_words_and_spaces(
    words: Iterable[str], text: str
) -> Tuple[List[str], List[bool]]:
    """Given a list of words and a text, reconstruct the original tokens and
    return a list of words and spaces that can be used to create a Doc. This
    can help recover destructive tokenization that didn't preserve any
    whitespace information.

    words (Iterable[str]): The words. Whitespace-only words are ignored.
    text (str): The original text.
    RETURNS (Tuple[List[str], List[bool]]): The words and spaces. Text found
        between two words (other than one single trailing space per word) is
        added as a separate word.
    RAISES (ValueError): If the words don't match the text once all
        whitespace is ignored, or if a word can't be located in the text.
    """
    # The words are iterated more than once below, so a one-shot iterable
    # (e.g. a generator) has to be materialized first.
    if not isinstance(words, (list, tuple)):
        words = list(words)
    if "".join("".join(words).split()) != "".join(text.split()):
        raise ValueError(Errors.E194.format(text=text, words=words))
    text_words = []
    text_spaces = []
    text_pos = 0
    # normalize words to remove all whitespace tokens
    norm_words = [word for word in words if not word.isspace()]
    # align words with text
    for word in norm_words:
        try:
            # Search from the current position instead of copying the rest of
            # the text for every word
            word_start = text.index(word, text_pos) - text_pos
        except ValueError:
            raise ValueError(Errors.E194.format(text=text, words=words)) from None
        if word_start > 0:
            # Anything skipped before the word becomes a token of its own
            text_words.append(text[text_pos : text_pos + word_start])
            text_spaces.append(False)
            text_pos += word_start
        text_words.append(word)
        text_spaces.append(False)
        text_pos += len(word)
        # One directly following space is attached to the word
        if text_pos < len(text) and text[text_pos] == " ":
            text_spaces[-1] = True
            text_pos += 1
    # Leftover text after the last word becomes a final token
    if text_pos < len(text):
        text_words.append(text[text_pos:])
        text_spaces.append(False)
    return (text_words, text_spaces)


def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
    """Create a copy of a config by wrapping it in a Config and calling its
    copy() method. Will raise an error if the config contents are not
    JSON-serializable.

    config (Config): The config to copy.
    RETURNS (Config): The copied config.
    """
    try:
        wrapped = Config(config)
        return wrapped.copy()
    except ValueError:
        # Replace the original error with the spaCy-specific message
        raise ValueError(Errors.E961.format(config=config)) from None


def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
    """Expand dot notation into nested dicts. For example: {"token.pos": True,
    "token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
    Keys are lowercased, and a value that was set earlier for the same path
    is kept.

    values (Dict[str, Any]): The key/value pairs to convert.
    RETURNS (Dict[str, dict]): The converted values.
    """
    result = {}
    for dotted_key, value in values.items():
        *parents, leaf = dotted_key.lower().split(".")
        node = result
        # Walk down to the innermost dict, creating levels where needed
        for name in parents:
            node = node.setdefault(name, {})
        node.setdefault(leaf, value)
    return result


def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
    """Flatten nested dicts into dot notation. For example: {"token": {"pos":
    True, "_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.

    obj (Dict[str, dict]): The dict to convert.
    RETURNS (Dict[str, Any]): The key/value pairs.
    """
    flat = {}
    for path, value in walk_dict(obj):
        flat[".".join(path)] = value
    return flat


def dot_to_object(config: Config, section: str):
    """Look up the part of a Config that a dot notation "section" points to,
    e.g. "training.optimizer" would return the Optimizer object.
    Throws an error if the section is not defined in this config.

    config (Config): The config.
    section (str): The dot notation of the section in the config.
    RETURNS: The object denoted by the section
    RAISES (KeyError): If any part of the section can't be resolved.
    """
    try:
        # Descend one level per dot-separated part, starting at the config
        return functools.reduce(
            lambda node, part: node[part], section.split("."), config
        )
    except (KeyError, TypeError):
        raise KeyError(Errors.E952.format(name=section)) from None


def walk_dict(
    node: Dict[str, Any], parent: List[str] = []
) -> Iterator[Tuple[List[str], Any]]:
    """Walk a dict depth-first and yield the key path and value of each leaf,
    i.e. of each value that isn't a dict itself.

    node (Dict[str, Any]): The dict to walk.
    parent (List[str]): Keys to prefix every yielded path with. The default
        list is never modified, a new list is built for every path.
    YIELDS (Tuple[List[str], Any]): The path and value of each leaf.
    """
    # Explicit stack of (path prefix, remaining items) instead of recursion
    pending = [(parent, iter(node.items()))]
    while pending:
        prefix, remaining = pending[-1]
        for key, value in remaining:
            path = [*prefix, key]
            if isinstance(value, dict):
                # Descend first, the current level is resumed afterwards
                pending.append((path, iter(value.items())))
                break
            yield (path, value)
        else:
            # This level is exhausted
            pending.pop()


def get_arg_names(func: Callable) -> List[str]:
    """Get a list of all named arguments of a function (regular,
    keyword-only).

    func (Callable): The function
    RETURNS (List[str]): The unique argument names in order of definition:
        regular arguments first, followed by keyword-only arguments.
    """
    argspec = inspect.getfullargspec(func)
    # dict.fromkeys removes duplicates like a set would, but keeps the
    # result in a stable order across runs
    return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))


def combine_score_weights(
    weights: List[Dict[str, float]],
    overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(),
) -> Dict[str, float]:
    """Merge the score weights defined by components into one normalized
    dict, e.g. {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and
    {"some_other_score": 1.0}.

    weights (List[dict]): The weights defined by the components.
    overrides (Dict[str, Optional[Union[float, int]]]): Values that replace
        the weights defined by the components for the given keys.
    RETURNS (Dict[str, float]): The combined and normalized weights. Scores
        whose weight is None are included with the value None, unless another
        component contributes a weight for the same key.
    """
    combined = {}
    # First pass: apply the overrides and set aside all None/null weights,
    # which shouldn't be shown in the table *or* be weighted
    weighted = []
    for component_weights in weights:
        kept = {}
        for key, value in component_weights.items():
            value = overrides.get(key, value)
            if value is not None:
                kept[key] = value
            else:
                combined[key] = None
        weighted.append(kept)
    n_components = len(weighted)
    # Second pass: the weights of a component may not sum to 1.0, so each one
    # is normalized by the component's total and then divided by the number
    # of components
    for component_weights in weighted:
        total = sum(component_weights.values())
        for key, value in component_weights.items():
            if total == 0:
                share = 0.0
            else:
                share = round(value / total / n_components, 2)
            previous = combined.get(key, 0.0)
            if previous is None:
                previous = 0.0
            combined[key] = previous + share
    return combined


class DummyTokenizer:
    # No-op implementations of to_bytes, from_bytes, to_disk and from_disk,
    # so that an object without state of its own can still take part in
    # serialization (see #1557)
    def to_bytes(self, **kwargs):
        """Serialize to an empty bytestring."""
        return bytes()

    def from_bytes(self, _bytes_data, **kwargs):
        """Ignore the data and return the object itself."""
        return self

    def to_disk(self, _path, **kwargs):
        """Write nothing and return None."""
        return None

    def from_disk(self, _path, **kwargs):
        """Read nothing and return the object itself."""
        return self


def create_default_optimizer() -> Optimizer:
    """Create an Adam optimizer with its default settings.

    RETURNS (Optimizer): The optimizer.
    """
    optimizer = Adam()
    return optimizer


def minibatch(items, size):
    """Yield consecutive batches of items as lists, until the items are
    used up. `size` is either an int, or an iterator that provides the size
    of each batch, so that the batch size can vary on each step.
    """
    sizes = itertools.repeat(size) if isinstance(size, int) else size
    stream = iter(items)
    while True:
        # One size is consumed per batch, including the final empty one
        n_items = int(next(sizes))
        batch = list(itertools.islice(stream, n_items))
        if not batch:
            return
        yield batch
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
-												Allow spacy project to push and pull to/from remote storage (#5949)

* Add utils for working with remote storage

* WIP add remote_cache for project

* WIP add push and pull commands

* Use pathy in remote_cache

* Update util

* Update remote_cache

* Update util

* Update project assets

* Update pull script

* Update push script

* Fix type annotation in util

* Work on remote storage

* Remove site and env hash

* Fix imports

* Fix type annotation

* Require pathy

* Require pathy

* Fix import

* Add a util to handle project variable substitution

* Import push and pull commands

* Fix pull command

* Fix push command

* Fix tarfile in remote_storage

* Improve printing

* Fiddle with status messages

* Set version to v3.0.0a9

* Draft docs for spacy project remote storages

* Update docs [ci skip]

* Use Thinc config to simplify and unify template variables

* Auto-format

* Don't import Pathy globally for now

Causes slow and annoying Google Cloud warning

* Tidy up test

* Tidy up and update tests

* Update to latest Thinc

* Update docs

* variables -> vars

* Update docs [ci skip]

* Update docs [ci skip]

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-23 19:32:09 +03:00
+								from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								from types import ModuleType
-												Add util.env_opt support: Can set hyper params through environment variables.

											
										
										
											2017-05-18 12:36:53 +03:00
+								import os
-												Move is_package and get_model_package_path to util

											
										
										
											2017-05-08 00:24:51 +03:00
+								import importlib
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								import importlib.util
-												Replacing regex library with re to increase tokenization speed (#3218)

* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alterations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive

											
										
										
											2019-02-01 10:05:22 +03:00
+								import re
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
+								from pathlib import Path
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitly to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not successful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematurely

* fix CharacterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								import thinc
-												Update docs and types

											
										
										
											2020-07-31 18:02:54 +03:00
+								from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-												Convert closure to functools.partial, to promote pickling

											
										
										
											2017-10-17 19:20:52 +03:00
+								import functools
-												Add missing import (fixes #1546)

											
										
										
											2017-11-10 21:05:18 +03:00
+								import itertools
-												Add contributor agreement for emulbreh

											
										
										
											2018-02-13 14:52:48 +03:00
+								import numpy.random
-												Use max(uint64) for OOV lexeme rank (#5303)

* Use max(uint64) for OOV lexeme rank

* Add test for default OOV rank

* Revert back to thinc==7.4.0

Requiring the updated version of thinc was unnecessary.

* Define OOV_RANK in one place

Define OOV_RANK in one place in `util`.

* Fix formatting [ci skip]

* Switch to external definitions of max(uint64)

Switch to external definitions of max(uint64) and confirm that they are
equal.
											
										
										
											2020-04-15 14:49:47 +03:00
+								import numpy
-												💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning

											
										
										
											2018-12-03 03:28:22 +03:00
+								import srsly
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								import catalogue
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								import sys
-												Add missing import
											
										
										
											2020-04-28 15:01:29 +03:00
+								import warnings
-												Use more sophisticated version parsing logic

											
										
										
											2020-05-30 16:01:58 +03:00
+								from packaging.specifiers import SpecifierSet, InvalidSpecifier
 								from packaging.version import Version, InvalidVersion
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								import subprocess
 								from contextlib import contextmanager
-												Update project CLI

											
										
										
											2020-06-22 15:53:31 +03:00
+								import tempfile
 								import shutil
-												split_command util function

											
										
										
											2020-06-30 13:54:15 +03:00
+								import shlex
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								import inspect
-												Replace lexeme_norm warning with logging

											
										
										
											2020-08-14 16:00:52 +03:00
+								import logging
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters functions that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
-												Set cupy.random seed in fix_random_seed helper

											
										
										
											2018-12-08 14:37:38 +03:00
+								try:
 								    import cupy.random
 								except ImportError:
 								    cupy = None
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								try:  # Python 3.8
 								    import importlib.metadata as importlib_metadata
 								except ImportError:
 								    import importlib_metadata
-												Remove dead and/or deprecated code (#5710)

* Remove dead and/or deprecated code

* Remove n_threads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-06 14:06:25 +03:00
+								# These are functions that were previously (v2.x) available from spacy.util
 								# and have since moved to Thinc. We're importing them here so people's code
 								# doesn't break, but they should always be imported from Thinc from now on,
 								# not from spacy.util.
 								from thinc.api import fix_random_seed, compounding, decaying  # noqa: F401
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								from .symbols import ORTH
-												split_command util function

											
										
										
											2020-06-30 13:54:15 +03:00
+								from .compat import cupy, CudaStream, is_windows
-												Add better error for failed model shortcut loading

											
										
										
											2020-08-06 00:10:29 +03:00
+								from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								from . import about
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								if TYPE_CHECKING:
 								    # This lets us add type hints for mypy etc. without causing circular imports
 								    from .language import Language  # noqa: F401
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    from .tokens import Doc, Span  # noqa: F401
 								    from .vocab import Vocab  # noqa: F401
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
-												Handle raw_input vs input in Python 2 and 3

											
										
										
											2017-03-21 00:48:32 +03:00
-												Use max(uint64) for OOV lexeme rank (#5303)

* Use max(uint64) for OOV lexeme rank

* Add test for default OOV rank

* Revert back to thinc==7.4.0

Requiring the updated version of thinc was unnecessary.

* Define OOV_RANK in one place

Define OOV_RANK in one place in `util`.

* Fix formatting [ci skip]

* Switch to external definitions of max(uint64)

Switch to external definitions of max(uint64) and confirm that they are
equal.
											
										
										
											2020-04-15 14:49:47 +03:00
+								OOV_RANK = numpy.iinfo(numpy.uint64).max
-												Improve vocab data integration and warning

											
										
										
											2020-07-25 12:51:30 +03:00
+								LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								# Default order of sections in the config.cfg. Not all sections needs to exist,
 								# and additional sections are added at the end, in alphabetical order.
 								# fmt: off
-												Adjust sort order [ci skip]

											
										
										
											2020-09-23 21:03:04 +03:00
+								CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								# fmt: on
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
-												Replace lexeme_norm warning with logging

											
										
										
											2020-08-14 16:00:52 +03:00
+								logging.basicConfig()
 								logger = logging.getLogger("spacy")
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitly to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not successful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematurely

* fix CharacterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								class registry(thinc.registry):
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    languages = catalogue.create("spacy", "languages", entry_points=True)
 								    architectures = catalogue.create("spacy", "architectures", entry_points=True)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    tokenizers = catalogue.create("spacy", "tokenizers", entry_points=True)
 								    lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    lookups = catalogue.create("spacy", "lookups", entry_points=True)
 								    displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
-												registry.assets -> registry.misc

											
										
										
											2020-09-03 18:31:14 +03:00
+								    misc = catalogue.create("spacy", "misc", entry_points=True)
-												Add config callbacks for modifying nlp object before and after init (#5866)

* WIP: Concept for modifying nlp object before and after init

* Make callbacks return nlp object

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Raise if callbacks don't return correct type

* Rename, update types, add after_pipeline_creation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-08-05 20:47:54 +03:00
+								    # Callback functions used to manipulate nlp object etc.
 								    callbacks = catalogue.create("spacy", "callbacks")
-												Create corpus iterator and batcher from registry during training (#5865)

* Move batchers into their own module (and registry)

* Update CLI

* Update Corpus and batcher

* Update tests

* Update one config

* Merge 'evaluation' block back under [training]

* Import batchers in gold __init__

* Fix batchers

* Update config

* Update schema

* Update util

* Don't assume train and dev are actually paths

* Update onto-joint config

* Fix missing import

* Format

* Format

* Update spacy/gold/corpus.py

Co-authored-by: Ines Montani <ines@ines.io>

* Fix name

* Update default config

* Fix get_length option in batchers

* Update test

* Add comment

* Pass path into Corpus

* Update docstring

* Update schema and configs

* Update config

* Fix test

* Fix paths

* Fix print

* Fix create_train_batches

* [training.read_train] -> [training.train_corpus]

* Update onto-joint config

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-04 16:09:37 +03:00
+								    batchers = catalogue.create("spacy", "batchers", entry_points=True)
 								    readers = catalogue.create("spacy", "readers", entry_points=True)
-												Weights & Biases logger for train CLI (#5971)

* quick test as part of train script

* train_logger in config, default ConsoleLogger in loggers catalogue

* entitiy typo

* add wandb_logger

* cleanup

* Update spacy/cli/train_logger.py

Co-authored-by: Ines Montani <ines@ines.io>

* move loggers to gold.loggers

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-26 16:24:33 +03:00
+								    loggers = catalogue.create("spacy", "loggers", entry_points=True)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    # These are factories registered via third-party packages and the
 								    # spacy_factories entry point. This registry only exists so we can easily
 								    # load them via the entry points. The "true" factories are added via the
 								    # Language.factory decorator (in the spaCy code base and user code) and those
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								    # are the factories used to initialize components via registry.resolve.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    _entry_point_factories = catalogue.create("spacy", "factories", entry_points=True)
 								    factories = catalogue.create("spacy", "internal_factories")
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
+								    # This is mostly used to get a list of all installed models in the current
 								    # environment. spaCy models packaged with `spacy package` will "advertise"
 								    # themselves via entry points.
 								    models = catalogue.create("spacy", "models", entry_points=True)
-												Add CLI registry (#6037)


											
										
										
											2020-09-08 16:23:34 +03:00
+								    cli = catalogue.create("spacy", "cli", entry_points=True)
-												Move lookup tables out of the core library (#4346)

* Add default to util.get_entry_point

* Tidy up entry points

* Read lookups from entry points

* Remove lookup tables and related tests

* Add lookups install option

* Remove lemmatizer tests

* Remove logic to process language data files

* Update setup.cfg

											
										
										
											2019-10-01 01:01:27 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Will raise an error if user or spaCy attempts to add to,
    remove from or otherwise mutate the dict.
    """

    def __init__(self, *args, error: str = Errors.E095, **kwargs) -> None:
        """Initialize the frozen dict. Can be initialized with pre-defined
        values.

        error (str): The error message when user tries to assign to dict.
        """
        # dict.__init__ fills the mapping without going through the
        # overridden mutators below, so pre-defined values are accepted.
        super().__init__(*args, **kwargs)
        self.error = error

    # Every mutating entry point of dict is overridden so that the custom
    # error message is raised consistently. Read-only access (lookup,
    # iteration, get, keys/values/items, copy) is inherited unchanged.

    def __setitem__(self, key, value):
        raise NotImplementedError(self.error)

    def __delitem__(self, key):
        raise NotImplementedError(self.error)

    def __ior__(self, other):
        # In-place merge (d |= other) would otherwise bypass update().
        raise NotImplementedError(self.error)

    def clear(self):
        raise NotImplementedError(self.error)

    def pop(self, key, default=None):
        raise NotImplementedError(self.error)

    def popitem(self):
        raise NotImplementedError(self.error)

    def setdefault(self, key, default=None):
        # setdefault inserts the key if it is missing, so it must be
        # blocked just like plain item assignment.
        raise NotImplementedError(self.error)

    def update(self, *args, **kwargs):
        # Accept the full dict.update call signature so that calls such as
        # d.update(a=1) raise the custom error instead of a TypeError.
        raise NotImplementedError(self.error)
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
class SimpleFrozenList(list):
    """List subclass whose common mutating methods raise a custom error
    instead of changing the list. Intended for properties such as
    Language.pipeline that hand out a list which should be treated as
    immutable, without switching to a tuple (which would break more
    backwards compatibility). A user who accidentally calls
    nlp.pipeline.append() gets a more helpful error message.
    """

    def __init__(self, *args, error: str = Errors.E927) -> None:
        """Initialize the frozen list.

        error (str): The error message when user tries to mutate the list.
        """
        self.error = error
        super().__init__(*args)

    def _refuse(self):
        # Single place that raises the configured error for all of the
        # guarded methods below.
        raise NotImplementedError(self.error)

    def append(self, *args, **kwargs):
        self._refuse()

    def clear(self, *args, **kwargs):
        self._refuse()

    def extend(self, *args, **kwargs):
        self._refuse()

    def insert(self, *args, **kwargs):
        self._refuse()

    def pop(self, *args, **kwargs):
        self._refuse()

    def remove(self, *args, **kwargs):
        self._refuse()

    def reverse(self, *args, **kwargs):
        self._refuse()

    def sort(self, *args, **kwargs):
        self._refuse()
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def lang_class_is_loaded(lang: str) -> bool:
-												Add support for vocab.writing_system property (#3390)

* Add xfail test for vocab.writing_system

* Add vocab.writing_system property

* Set Language.Defaults.writing_system

* Set default writing system

* Remove xfail on test_vocab_writing_system

											
										
										
											2019-03-11 17:23:20 +03:00
+								    """Check whether a Language class is already loaded. Language classes are
 								    loaded lazily, to avoid expensive setup code associated with the language
 								    data.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    lang (str): Two-letter language code, e.g. 'en'.
-												Add support for vocab.writing_system property (#3390)

* Add xfail test for vocab.writing_system

* Add vocab.writing_system property

* Set Language.Defaults.writing_system

* Set default writing system

* Remove xfail on test_vocab_writing_system

											
										
										
											2019-03-11 17:23:20 +03:00
+								    RETURNS (bool): Whether a Language class has been loaded.
 								    """
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    return lang in registry.languages
-												Auto-format [ci skip]

											
										
										
											2019-03-11 19:10:50 +03:00
-												Add support for vocab.writing_system property (#3390)

* Add xfail test for vocab.writing_system

* Add vocab.writing_system property

* Set Language.Defaults.writing_system

* Set default writing system

* Remove xfail on test_vocab_writing_system

											
										
										
											2019-03-11 17:23:20 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_lang_class(lang: str) -> "Language":
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    """Import and load a Language class.
-												add lang registration facility

											
										
										
											2016-03-25 20:54:45 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    lang (str): Two-letter language code, e.g. 'en'.
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    RETURNS (Language): Language class.
 								    """
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    # Check if language is registered / entry point is available
 								    if lang in registry.languages:
 								        return registry.languages.get(lang)
 								    else:
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								        try:
-												More formatting changes

											
										
										
											2019-12-25 19:59:52 +03:00
+								            module = importlib.import_module(f".lang.{lang}", "spacy")
-												Also raise original error message in util.get_lang_class

Otherwise, the true error that happens within a Language subclass is swallowed, because if it's imported lazily like that, it'll always be an ImportError

											
										
										
											2019-02-13 18:52:25 +03:00
+								        except ImportError as err:
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								            raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								        set_lang_class(lang, getattr(module, module.__all__[0]))
 								    return registry.languages.get(lang)
-												add lang registration facility

											
										
										
											2016-03-25 20:54:45 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def set_lang_class(name: str, cls: Type["Language"]) -> None:
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    """Set a custom Language class name that can be loaded via get_lang_class.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    name (str): Name of Language class.
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    cls (Language): Language class.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Replace function registries with catalogue (#4584)

* Replace functions registries with catalogue

* Update __init__.py

* Fix test

* Revert unrelated flag [ci skip]

											
										
										
											2019-11-07 13:45:22 +03:00
+								    registry.languages.register(name, func=cls)
-												Add load_lang_class() util function

											
										
										
											2017-05-09 00:50:45 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def ensure_path(path: Any) -> Any:
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    """Ensure string is converted to a Path.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    path (Any): Anything. If string, it's converted to Path.
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    RETURNS: Path or original argument.
 								    """
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    if isinstance(path, str):
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 13:11:16 +03:00
+								        return Path(path)
 								    else:
 								        return path
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def load_language_data(path: Union[str, Path]) -> Union[dict, list]:
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    """Load JSON language data using the given path as a base. If the provided
 								    path isn't present, will attempt to load a gzipped version before giving up.
-												Reduce size of language data (#4141)

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzips data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's
removed.

The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths

											
										
										
											2019-08-20 15:54:11 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    path (str / Path): The data to load.
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    RETURNS: The loaded data.
-												Reduce size of language data (#4141)

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzips the data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's
removed.

The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths

											
										
										
											2019-08-20 15:54:11 +03:00
+								    """
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    path = ensure_path(path)
 								    if path.exists():
-												Reduce size of language data (#4141)

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzips the data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's
removed.

The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths

											
										
										
											2019-08-20 15:54:11 +03:00
+								        return srsly.read_json(path)
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
+								    path = path.with_suffix(path.suffix + ".gz")
 								    if path.exists():
 								        return srsly.read_gzip_json(path)
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    raise ValueError(Errors.E160.format(path=path))
-												💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

											
										
										
											2019-08-22 15:21:32 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def get_module_path(module: ModuleType) -> Path:
    """Get the directory that contains a Python module.

    module (ModuleType): The Python module. Objects that carry a `__module__`
        attribute naming an imported module (e.g. classes) are accepted too.
    RETURNS (Path): The parent directory of the module's file.
    RAISES (ValueError): If no module file can be determined for the object.
    """
    if isinstance(module, ModuleType):
        # Real module objects have no `__module__` attribute, so the file
        # location has to be read from the module itself. Modules without a
        # file (e.g. built-ins) can't be resolved to a path.
        module_file = getattr(module, "__file__", None)
        if module_file is None:
            raise ValueError(Errors.E169.format(module=repr(module)))
        return Path(module_file).parent
    if not hasattr(module, "__module__"):
        raise ValueError(Errors.E169.format(module=repr(module)))
    # Classes and functions: look up the module they were defined in.
    return Path(sys.modules[module.__module__].__file__).parent
-												Reduce size of language data (#4141)

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzips the data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's
removed.

The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths

											
										
										
											2019-08-20 15:54:11 +03:00
-												Add load_vectors_into_model util

											
										
										
											2020-07-28 22:56:02 +03:00
def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings=True
) -> None:
    """Load word vectors from an installed model or path into a model instance.

    nlp (Language): The nlp object whose vocab receives the vectors.
    name (str / Path): Package name or model path, passed on to load_model.
    add_strings (bool): If True, also add the string of every vector key that
        the source model's string store knows about to nlp.vocab.strings.
    """
    source_nlp = load_model(name)
    nlp.vocab.vectors = source_nlp.vocab.vectors
    if not add_strings:
        return
    # The vector table is keyed by hash. Copy over the matching strings so
    # the keys can be resolved on the target vocab as well (e.g. someone
    # running a similarity query might expect the strings to be there).
    source_strings = source_nlp.vocab.strings
    target_strings = nlp.vocab.strings
    for vector_key in nlp.vocab.vectors.key2row:
        if vector_key in source_strings:
            target_strings.add(source_strings[vector_key])
-												Add load_vectors_into_model util

											
										
										
											2020-07-28 22:56:02 +03:00
-												Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created.

The option moves from `nlp.load_vocab_data` to `training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided.

The loading from `spacy-lookups-data` is now strict, so configs for each
language should specify the exact tables required. This also makes it
easier to control whether the larger clusters and probs tables are
included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```

											
										
										
											2020-09-18 16:45:55 +03:00
def load_vocab_data_into_model(
    nlp: "Language", *, lookups: Optional["Lookups"] = None
) -> None:
    """Set lookups tables on the vocab of an nlp object.

    nlp (Language): The nlp object whose vocab is updated.
    lookups (Optional[Lookups]): The lookups to assign to nlp.vocab.lookups.
        A falsy value, including the default None, leaves the vocab as it is.
    """
    if not lookups:
        return
    nlp.vocab.lookups = lookups
-												Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created.

The option moves from `nlp.load_vocab_data` to `training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided.

The loading from `spacy-lookups-data` is now strict, so configs for each
language should specify the exact tables required. This also makes it
easier to control whether the larger clusters and probs tables are
included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```

											
										
										
											2020-09-18 16:45:55 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
def load_model(
    name: Union[str, Path],
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Load a model from an installed package or a data path.

    name (str / Path): Package name, model path or a "blank:{lang}" shortcut.
        The shortcut returns a new instance of the language class and does not
        use any of the keyword arguments below.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable.
    exclude (Iterable[str]): Names of pipeline components to exclude.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    RAISES (IOError): If the name is an outdated model shortcut, or if it is
        neither an installed package nor an existing path.
    """
    load_args = dict(vocab=vocab, disable=disable, exclude=exclude, config=config)
    is_string = isinstance(name, str)  # package name or string path
    if is_string and name.startswith("blank:"):  # shortcut for blank model
        lang_cls = get_lang_class(name.replace("blank:", ""))
        return lang_cls()
    if is_string and is_package(name):  # installed as package
        return load_model_from_package(name, **load_args)
    if is_string and Path(name).exists():  # path to model data directory
        return load_model_from_path(Path(name), **load_args)
    if not is_string and hasattr(name, "exists"):  # Path or Path-like
        return load_model_from_path(name, **load_args)
    # Nothing matched: give a more specific error for old-style shortcuts.
    if name in OLD_MODEL_SHORTCUTS:
        raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name]))
    raise IOError(Errors.E050.format(name=name))
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
def load_model_from_package(
    name: str,
    *,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Import an installed model package by name and delegate to its load().

    name (str): The package name.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
    # The imported object is a module (not a class); it is expected to expose
    # a load() function that accepts the keyword arguments forwarded below.
    model_package = importlib.import_module(name)
    load_kwargs = {
        "vocab": vocab,
        "disable": disable,
        "exclude": exclude,
        "config": config,
    }
    return model_package.load(**load_kwargs)
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
def load_model_from_path(
    model_path: Union[str, Path],
    *,
    meta: Optional[Dict[str, Any]] = None,
    vocab: Union["Vocab", bool] = True,
    disable: Iterable[str] = SimpleFrozenList(),
    exclude: Iterable[str] = SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
    """Load a model from a data directory path. Creates Language class with
    pipeline from config.cfg and then calls from_disk() with path.

    model_path (str / Path): Path to the model data directory.
    meta (Dict[str, Any]): Optional model meta.
    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
        a new Vocab object will be created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
        components won't be loaded.
    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    RAISES (IOError): If model_path does not exist.
    """
    # The signature accepts plain strings, but everything below uses the
    # pathlib API (.exists(), the "/" operator), so normalize up front.
    model_path = Path(model_path)
    if not model_path.exists():
        raise IOError(Errors.E052.format(path=model_path))
    if not meta:
        # NOTE(review): the returned meta is not used further in this function;
        # the call is kept as-is in case callers rely on errors it may raise
        # for a missing/invalid meta file -- confirm against get_model_meta.
        meta = get_model_meta(model_path)
    config_path = model_path / "config.cfg"
    # User-supplied overrides are flattened to dot notation before being
    # applied while reading the model's config.cfg.
    config = load_config(config_path, overrides=dict_to_dot(config))
    nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude)
    return nlp.from_disk(model_path, exclude=exclude)
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def load_model_from_config(
 								    config: Union[Dict[str, Any], Config],
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    *,
 								    vocab: Union["Vocab", bool] = True,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								    disable: Iterable[str] = SimpleFrozenList(),
 								    exclude: Iterable[str] = SimpleFrozenList(),
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    auto_fill: bool = False,
 								    validate: bool = True,
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								) -> "Language":
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    """Create an nlp object from a config. Expects the full config file including
 								    a section "nlp" containing the settings for the nlp object.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
 								    name (str): Package name or model path.
 								    meta (Dict[str, Any]): Optional model meta.
 								    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
 								        a new Vocab object will be created.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    disable (Iterable[str]): Names of pipeline components to disable. Disabled
 								        pipes will be loaded but they won't be run unless you explicitly
 								        enable them by calling nlp.enable_pipe.
 								    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
 								        components won't be loaded.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    auto_fill (bool): Whether to auto-fill config with missing defaults.
 								    validate (bool): Whether to show config validation errors.
 								    RETURNS (Language): The loaded nlp object.
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    """
 								    if "nlp" not in config:
 								        raise ValueError(Errors.E985.format(config=config))
 								    nlp_config = config["nlp"]
-												util function dot_to_object and corresponding unit test

											
										
										
											2020-07-27 18:50:12 +03:00
+								    if "lang" not in nlp_config or nlp_config["lang"] is None:
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        raise ValueError(Errors.E993.format(config=nlp_config))
 								    # This will automatically handle all codes registered via the languages
 								    # registry, including custom subclasses provided via entry points
 								    lang_cls = get_lang_class(nlp_config["lang"])
 								    nlp = lang_cls.from_config(
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        config,
 								        vocab=vocab,
 								        disable=disable,
 								        exclude=exclude,
 								        auto_fill=auto_fill,
 								        validate=validate,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    )
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								    return nlp
 								def resolve_training_config(
 								    config: Config,
 								    exclude: Iterable[str] = ("nlp", "components"),
 								    validate: bool = True,
 								) -> Dict[str, Any]:
 								    """Resolve the config sections relevant for trainig and create all objects.
 								    Mostly used in the CLI to separate training config (not resolved by default
 								    because not runtime-relevant – an nlp object should load fine even if it's
 								    [training] block refers to functions that are not available etc.).
 								    config (Config): The config to resolve.
 								    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
 								        be available in the final resolved config.
 								    validate (bool): Whether to validate the config.
 								    RETURNS (Dict[str, Any]): The resolved config.
 								    """
 								    config = config.copy()
 								    for key in exclude:
 								        if key in config:
-												Fix exclude and add test

											
										
										
											2020-09-28 00:34:03 +03:00
+								            config.pop(key)
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								    return registry.resolve(config, validate=validate)
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 20:42:27 +03:00
-												Add (untested) resolve_dot_names util

											
										
										
											2020-09-28 04:06:12 +03:00
+								def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> List[Optional[Callable]]:
 								    """Resolve one or more "dot notation" names, e.g. corpora.train.
 								    The paths could point anywhere into the config, so we don't know which
 								    top-level section we'll be looking within.
 								    We resolve the whole top-level section, although we could resolve less --
 								    we could find the lowest part of the tree.
 								    """
 								    resolved = {}
 								    output = []
 								    for name in dot_names:
 								        if name is None:
 								            output.append(name)
 								        else:
 								            section = name.split(".")[0]
 								            # We want to avoid resolving the same thing twice.
 								            if section not in resolved:
-												Remove schema=None until Optional

											
										
										
											2020-09-28 04:42:58 +03:00
+								                resolved[section] = registry.resolve(config[section])
-												Add (untested) resolve_dot_names util

											
										
										
											2020-09-28 04:06:12 +03:00
+								            output.append(dot_to_object(resolved, name))
 								    return output
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def load_model_from_init_py(
 								    init_file: Union[Path, str],
-												Allow adding pipeline components from source model (#5857)

* Allow adding pipeline components from source model

* Config: name -> component

* Improve error messages

* Fix error and test

* Add frozen components and exclude logic

* Remove exclude from Language.evaluate

* Init sourced components with current vocab

* Fix error codes
											
										
										
											2020-08-05 00:39:19 +03:00
+								    *,
 								    vocab: Union["Vocab", bool] = True,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 16:20:11 +03:00
+								    disable: Iterable[str] = SimpleFrozenList(),
 								    exclude: Iterable[str] = SimpleFrozenList(),
-												Simplify config overrides in CLI and deserialization (#5880)


											
										
										
											2020-08-06 00:35:09 +03:00
+								    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								) -> "Language":
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
+								    """Helper function to use in the `load()` method of a model package's
 								    __init__.py.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
 								    vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
 								        a new Vocab object will be created.
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								    disable (Iterable[str]): Names of pipeline components to disable. Disabled
 								        pipes will be loaded but they won't be run unless you explicitly
 								        enable them by calling nlp.enable_pipe.
 								    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
 								        components won't be loaded.
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    config (Dict[str, Any] / Config): Config overrides as nested dict or dict
 								        keyed by section values in dot notation.
 								    RETURNS (Language): The loaded nlp object.
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
+								    """
 								    model_path = Path(init_file).parent
-												Fix and document model loading with pipeline and overrides

											
										
										
											2017-05-29 15:10:10 +03:00
+								    meta = get_model_meta(model_path)
-												More formatting changes

											
										
										
											2019-12-25 19:59:52 +03:00
+								    data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
-												Fix and document model loading with pipeline and overrides

											
										
										
											2017-05-29 15:10:10 +03:00
+								    data_path = model_path / data_dir
 								    if not model_path.exists():
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								        raise IOError(Errors.E052.format(path=data_path))
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    return load_model_from_path(
-												Allow loaded but disabled components

											
										
										
											2020-08-28 16:20:14 +03:00
+								        data_path,
 								        vocab=vocab,
 								        meta=meta,
 								        disable=disable,
 								        exclude=exclude,
 								        config=config,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    )
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
def load_config(
    path: Union[str, Path],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    interpolate: bool = False,
) -> Config:
    """Read a config file from disk, validating the path first and applying
    the default section order (CONFIG_SECTION_ORDER).

    path (Union[str, Path]): Location of the config file.
    overrides (Dict[str, Any]): Values to override, either as a nested dict or
        as a dict keyed by section values in dot notation.
    interpolate (bool): Whether to interpolate and resolve variables.
    RETURNS (Config): The loaded config.
    RAISES (IOError): If the path does not exist or is not a regular file.
    """
    cfg_file = ensure_path(path)
    is_existing_file = cfg_file.exists() and cfg_file.is_file()
    if not is_existing_file:
        raise IOError(Errors.E053.format(path=cfg_file, name="config.cfg"))
    config = Config(section_order=CONFIG_SECTION_ORDER)
    return config.from_disk(cfg_file, overrides=overrides, interpolate=interpolate)
 								def load_config_from_str(
 								    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
 								):
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
+								    """Load a full config from a string. Wrapper around Thinc's Config.from_str.
 								    text (str): The string config to load.
 								    interpolate (bool): Whether to interpolate and resolve variables.
 								    RETURNS (Config): The loaded config.
 								    """
-												Update Thinc and include section order

											
										
										
											2020-08-14 15:06:22 +03:00
+								    return Config(section_order=CONFIG_SECTION_ORDER).from_str(
 								        text, overrides=overrides, interpolate=interpolate,
 								    )
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def get_installed_models() -> List[str]:
    """List all model packages currently installed in the environment, as
    reported by the models registry.

    RETURNS (List[str]): The string names of the models.
    """
    registered = registry.models.get_all()
    return [model_name for model_name in registered.keys()]
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def get_package_version(name: str) -> Optional[str]:
    """Look up the version of an installed package. Typically used to get
    model package versions.

    name (str): The name of the installed Python package.
    RETURNS (str / None): The version or None if package not installed.
    """
    try:
        installed_version = importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        # No distribution metadata under that name: report "not installed"
        installed_version = None
    return installed_version
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def is_compatible_version(
    version: str, constraint: str, prereleases: bool = True
) -> Optional[bool]:
    """Check if a version (e.g. "2.0.0") is compatible given a version
    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
    it's interpreted as =={version}. An empty constraint is handed to
    SpecifierSet unchanged instead of raising an IndexError.

    version (str): The version to check.
    constraint (str): The constraint string.
    prereleases (bool): Whether to allow prereleases. If set to False,
        prerelease versions will be considered incompatible.
    RETURNS (bool / None): Whether the version is compatible, or None if the
        version or constraint are invalid.
    """
    # Handle cases where exact version is provided as constraint. Slicing
    # instead of indexing keeps an empty string from raising IndexError.
    if constraint[:1].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        # Use a separate name so the str parameter isn't rebound to a Version
        parsed_version = Version(version)
    except (InvalidSpecifier, InvalidVersion):
        return None
    spec.prereleases = prereleases
    return parsed_version in spec
-												Better model compatibility and validation

											
										
										
											2020-05-22 16:42:46 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def is_unconstrained_version(
    constraint: str, prereleases: bool = True
) -> Optional[bool]:
    """Check whether a version constraint is too loose, i.e. it neither pins
    an exact version nor defines both a lower and an upper bound.

    constraint (str): The constraint string, e.g. ">=1.9.0,<2.2.1" or "2.0.0".
    prereleases (bool): Value assigned to the specifier set's prereleases
        setting.
    RETURNS (bool / None): True if the constraint is unconstrained, False if
        it is an exact version or bounded on both sides, or None if the
        constraint can't be parsed.
    """
    # We have an exact version, this is the ultimate constrained version.
    # Slicing instead of indexing keeps an empty string from raising IndexError.
    if constraint[:1].isdigit():
        return False
    try:
        spec = SpecifierSet(constraint)
    except InvalidSpecifier:
        return None
    spec.prereleases = prereleases
    specs = list(spec)
    # We only have one version spec and it defines > or >=
    if len(specs) == 1 and specs[0].operator in (">", ">="):
        return True
    # One specifier is exact version. Compare with a real 1-tuple: ("==")
    # is just the string "==", which would make `in` a substring test.
    if any(sp.operator in ("==",) for sp in specs):
        return False
    has_upper = any(sp.operator in ("<", "<=") for sp in specs)
    has_lower = any(sp.operator in (">", ">=") for sp in specs)
    # We have a version spec that defines an upper and lower bound
    if has_upper and has_lower:
        return False
    # Everything else, like only an upper version, only a lower version etc.
    return True
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def get_model_version_range(spacy_version: str) -> str:
    """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
    version. Models are always compatible across patch versions but not
    across minor or major versions.

    spacy_version (str): The spaCy version, e.g. "1.2.3".
    RETURNS (str): The version range, with the given version as the lower
        bound and the next minor version as the exclusive upper bound.
    """
    release = Version(spacy_version).release
    # Pad with a zero so a single-component version like "3" (release == (3,))
    # still has a minor part instead of raising IndexError.
    major, minor = (release + (0,))[:2]
    return f">={spacy_version},<{major}.{minor + 1}.0"
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def get_base_version(version: str) -> str:
    """Strip any prerelease identifiers from a version string.

    version (str): The version, e.g. "3.0.0.dev1".
    RETURNS (str): The base version, e.g. "3.0.0".
    """
    parsed = Version(version)
    return parsed.base_version
-												WIP: improve model version deps

											
										
										
											2020-05-28 13:51:37 +03:00
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
    """Read a model meta.json from disk and validate its contents.

    Emits a warning if the meta declares a "spacy_version" constraint that the
    running spaCy version does not satisfy, and another if that constraint is
    unconstrained.

    path (Union[str, Path]): Path to meta.json.
    RETURNS (Dict[str, Any]): The loaded meta.
    RAISES (IOError): If the parent directory or the file itself is missing.
    RAISES (ValueError): If "lang", "name" or "version" is missing or empty.
    """
    meta_path = ensure_path(path)
    parent_dir = meta_path.parent
    if not parent_dir.exists():
        raise IOError(Errors.E052.format(path=parent_dir))
    if not (meta_path.exists() and meta_path.is_file()):
        raise IOError(Errors.E053.format(path=meta_path, name="meta.json"))
    meta = srsly.read_json(meta_path)
    # These settings are required and must be non-empty
    for required in ("lang", "name", "version"):
        if required not in meta or not meta[required]:
            raise ValueError(Errors.E054.format(setting=required))
    # Without a declared spaCy version there's nothing further to check
    if "spacy_version" not in meta:
        return meta
    declared = meta["spacy_version"]
    model_name = f"{meta['lang']}_{meta['name']}"
    if not is_compatible_version(about.__version__, declared):
        incompatible_msg = Warnings.W095.format(
            model=model_name,
            model_version=meta["version"],
            version=declared,
            current=about.__version__,
        )
        warnings.warn(incompatible_msg)
    if is_unconstrained_version(declared):
        loose_msg = Warnings.W094.format(
            model=model_name,
            model_version=meta["version"],
            version=declared,
            example=get_model_version_range(about.__version__),
        )
        warnings.warn(loose_msg)
    return meta
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
-												Update docs and util consistency

											
										
										
											2020-08-18 02:22:59 +03:00
def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
    """Load and validate the meta.json located inside a model directory.

    path (Union[str, Path]): Path to model directory.
    RETURNS (Dict[str, Any]): The model's meta data.
    """
    meta_file = ensure_path(path) / "meta.json"
    return load_meta(meta_file)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def is_package(name: str) -> bool:
    """Check if string maps to a package installed via pip.

    name (str): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    try:
        importlib_metadata.distribution(name)
    except Exception:
        # Best-effort check: any lookup failure means "not a package". Unlike a
        # bare except, this lets KeyboardInterrupt and SystemExit propagate.
        return False
    else:
        return True
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def get_package_path(name: str) -> Path:
    """Get the path to an installed package.

    name (str): Package name.
    RETURNS (Path): Path to installed package.
    """
    name = name.lower()  # use lowercase version to be safe
    # Here we're importing the module just to find it. This is worryingly
    # indirect, but it's otherwise very difficult to find the package.
    pkg = importlib.import_module(name)
    if getattr(pkg, "__file__", None) is None:
        # Namespace packages have no __file__ (it is None), so fall back to
        # the first entry of the package search path, which is the package
        # directory itself.
        locations = list(getattr(pkg, "__path__", ()))
        if locations:
            return Path(locations[0])
    # Regular package: __file__ points at its __init__.py. If neither
    # __file__ nor __path__ is usable, this fails exactly as it did before.
    return Path(pkg.__file__).parent
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
-												Make run_command take string and list

											
										
										
											2020-06-30 14:17:00 +03:00
def split_command(command: str) -> List[str]:
    """Tokenize a command string with shlex. POSIX splitting rules are used
    everywhere except on Windows.

    command (str): The command to split.
    RETURNS (List[str]): The split command.
    """
    use_posix = not is_windows
    return shlex.split(command, posix=use_posix)
-												Refactor project CLI (#5732)

* Make project command a submodule

* Update with WIP

* Add helper for joining commands

* Update docstrins, formatting and types

* Update assets and add support for copying local files

* Fix type

* Update success messages
											
										
										
											2020-07-09 02:42:51 +03:00
def join_command(command: List[str]) -> str:
    """Combine command parts into a single shell-quoted string. shlex.join is
    only available for Python 3.8+, so each part is quoted individually and
    the results are joined with spaces.

    command (List[str]): The command to join.
    RETURNS (str): The joined command.
    """
    quoted_parts = map(shlex.quote, command)
    return " ".join(quoted_parts)
-												Update types and docstrings

											
										
										
											2020-09-13 11:52:02 +03:00
+								def run_command(
 								    command: Union[str, List[str]],
 								    *,
 								    stdin: Optional[Any] = None,
-												Format

											
										
										
											2020-09-20 17:31:48 +03:00
+								    capture: bool = False,
-												Update types and docstrings

											
										
										
											2020-09-13 11:52:02 +03:00
+								) -> Optional[subprocess.CompletedProcess]:
-												Make run_command take string and list

											
										
										
											2020-06-30 14:17:00 +03:00
+								    """Run a command on the command line as a subprocess. If the subprocess
 								    returns a non-zero exit code, a system exit is performed.
 								    command (str / List[str]): The command. If provided as a string, the
 								        string will be split using shlex.split.
-												Update types and docstrings

											
										
										
											2020-09-13 11:52:02 +03:00
+								    stdin (Optional[Any]): stdin to read from or None.
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								    capture (bool): Whether to capture the output and errors. If False,
 								        the stdout and stderr will not be redirected, and if there's an error,
 								        sys.exit will be called with the returncode. You should use capture=False
 								        when you want to turn over execution to the command, and capture=True
 								        when you want to run the command more like a function.
-												Update types and docstrings

											
										
										
											2020-09-13 11:52:02 +03:00
+								    RETURNS (Optional[CompletedProcess]): The process object.
-												Make run_command take string and list

											
										
										
											2020-06-30 14:17:00 +03:00
+								    """
 								    if isinstance(command, str):
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								        cmd_list = split_command(command)
 								        cmd_str = command
 								    else:
 								        cmd_list = command
 								        cmd_str = " ".join(command)
-												add custom warning when run_command fails

											
										
										
											2020-06-30 18:28:43 +03:00
+								    try:
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								        ret = subprocess.run(
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								            cmd_list,
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								            env=os.environ.copy(),
 								            input=stdin,
-												Fix run_command for python 3.6

											
										
										
											2020-08-26 06:02:43 +03:00
+								            encoding="utf8",
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								            check=False,
-												Make run_command backwards compatible

											
										
										
											2020-08-26 05:33:42 +03:00
+								            stdout=subprocess.PIPE if capture else None,
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								            stderr=subprocess.STDOUT if capture else None,
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								        )
-												add custom warning when run_command fails

											
										
										
											2020-06-30 18:28:43 +03:00
+								    except FileNotFoundError:
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								        # Indicates the *command* wasn't found, it's an error before the command
 								        # is run.
-												add custom warning when run_command fails

											
										
										
											2020-06-30 18:28:43 +03:00
+								        raise FileNotFoundError(
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								            Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								        ) from None
-												Improve error handling in run_command

											
										
										
											2020-09-20 17:20:57 +03:00
+								    if ret.returncode != 0 and capture:
 								        message = f"Error running command:\n\n{cmd_str}\n\n"
 								        message += f"Subprocess exited with status {ret.returncode}"
 								        if ret.stdout is not None:
 								            message += f"\n\nProcess log (stdout and stderr):\n\n"
 								            message += ret.stdout
 								        error = subprocess.SubprocessError(message)
 								        error.ret = ret
 								        error.command = cmd_str
 								        raise error
 								    elif ret.returncode != 0:
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 05:00:14 +03:00
+								        sys.exit(ret.returncode)
 								    return ret
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
 								@contextmanager
 								def working_dir(path: Union[str, Path]) -> None:
 								    """Change current working directory and returns to previous on exit.
 								    path (str / Path): The directory to navigate to.
-												Make working_dir yield absolute cwd path

											
										
										
											2020-06-30 14:17:14 +03:00
+								    YIELDS (Path): The absolute path to the current working directory. This
 								        should be used if the block needs to perform actions within the working
 								        directory, to prevent mismatches with relative paths.
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								    """
 								    prev_cwd = Path.cwd()
-												Resolve within working_dir context manager

											
										
										
											2020-06-30 14:29:45 +03:00
+								    current = Path(path).resolve()
 								    os.chdir(str(current))
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								    try:
-												Resolve within working_dir context manager

											
										
										
											2020-06-30 14:29:45 +03:00
+								        yield current
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
+								    finally:
-												Resolve within working_dir context manager

											
										
										
											2020-06-30 14:29:45 +03:00
+								        os.chdir(str(prev_cwd))
-												Refactor CLI

											
										
										
											2020-06-21 22:35:01 +03:00
-												Update project CLI

											
										
										
											2020-06-22 15:53:31 +03:00
@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
    """Execute a block in a temporary directory and remove the directory and
    its contents at the end of the with block.

    The directory is removed even if the with block raises. If removal fails
    with a PermissionError, a warning (W091) is emitted instead of raising.

    YIELDS (Path): The path of the temp directory.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Clean up in a finally clause so an exception inside the with block
        # doesn't leak the temporary directory on disk.
        try:
            shutil.rmtree(str(d))
        except PermissionError as e:
            # Best effort: warn rather than crash if a file can't be deleted.
            warnings.warn(Warnings.W091.format(dir=d, msg=e))
-												Update project CLI

											
										
										
											2020-06-22 15:53:31 +03:00
-												Refactor project CLI (#5732)

* Make project command a submodule

* Update with WIP

* Add helper for joining commands

* Update docstrins, formatting and types

* Update assets and add support for copying local files

* Fix type

* Update success messages
											
										
										
											2020-07-09 02:42:51 +03:00
def is_cwd(path: Union[Path, str]) -> bool:
    """Tell whether the given path points at the current working directory.

    path (Union[Path, str]): The directory path.
    RETURNS (bool): Whether the path is the current working directory.
    """
    candidate = str(Path(path).resolve())
    current = str(Path.cwd().resolve())
    # Both sides are lowercased, so the comparison ignores case.
    return candidate.lower() == current.lower()
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def is_in_jupyter() -> bool:
    """Detect whether spaCy is being run from inside a Jupyter notebook by
    looking at the class name of the active IPython shell. Mainly used for
    the displaCy visualizer.

    RETURNS (bool): True if in Jupyter, False if not.
    """
    # https://stackoverflow.com/a/39662359/6400719
    try:
        shell = get_ipython().__class__.__name__
    except NameError:
        # get_ipython isn't defined: probably standard Python interpreter
        return False
    # "ZMQInteractiveShell" means Jupyter notebook or qtconsole; any other
    # shell counts as "not Jupyter".
    return shell == "ZMQInteractiveShell"
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
def get_object_name(obj: Any) -> str:
    """Produce a human-readable name for a Python object, e.g. a pipeline
    component.

    obj (Any): The Python object, typically a function or class.
    RETURNS (str): A human-readable name.
    """
    # Lookup order: explicit `name`, then `__name__`, then the class name.
    for attr in ("name", "__name__"):
        if hasattr(obj, attr):
            return getattr(obj, attr)
    if hasattr(obj, "__class__"):
        cls = obj.__class__
        if hasattr(cls, "__name__"):
            return cls.__name__
    # Last resort when none of the attributes above exist.
    return repr(obj)
-												Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to go for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors

											
										
										
											2019-10-27 15:35:49 +03:00
-												Allow component decorators to re-run with same function

											
										
										
											2020-08-28 17:27:22 +03:00
def is_same_func(func1: Callable, func2: Callable) -> bool:
    """Approximately decide whether two functions are the same, even if their
    identity is different (e.g. after they have been live reloaded). Mostly
    used in the @Language.component and @Language.factory decorators to decide
    whether to raise if a factory already exists. Allows decorator to run
    multiple times with the same function.

    Two callables count as the same if their qualified name, the file they
    are defined in and their source lines are all equal. Non-callables, and
    callables without a __qualname__ (e.g. callable instances), are never
    considered the same.

    func1 (Callable): The first function.
    func2 (Callable): The second function.
    RETURNS (bool): Whether it's the same function (most likely).
    """
    if not callable(func1) or not callable(func2):
        return False
    # Callable objects such as class instances or functools.partial have no
    # __qualname__; treat them as "not the same" instead of raising.
    if not hasattr(func1, "__qualname__") or not hasattr(func2, "__qualname__"):
        return False
    same_name = func1.__qualname__ == func2.__qualname__
    # NOTE: the inspect helpers below can still raise for objects whose
    # source isn't available (see the inspect module documentation).
    same_file = inspect.getfile(func1) == inspect.getfile(func2)
    same_code = inspect.getsourcelines(func1) == inspect.getsourcelines(func2)
    return same_name and same_file and same_code
-												Allow component decorators to re-run with same function

											
										
										
											2020-08-28 17:27:22 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_cuda_stream(
 								    require: bool = False, non_blocking: bool = True
 								) -> Optional[CudaStream]:
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								    ops = get_current_ops()
-												Revert "Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop""

This reverts commit f41e626844c62c1cf1333a99df2e027ec205a7c6.

											
										
										
											2018-03-27 20:22:52 +03:00
+								    if CudaStream is None:
 								        return None
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitely to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not succesful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematuraly

* fix CharachterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
+								    elif isinstance(ops, NumpyOps):
-												Revert "Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop""

This reverts commit f41e626844c62c1cf1333a99df2e027ec205a7c6.

											
										
										
											2018-03-27 20:22:52 +03:00
+								        return None
 								    else:
-												Support no hidden layer in parser and NER (#4672)

* Support no hidden layers for parser

* Fix parser model for depth 1

* Fix parser for hidden depth=0

* Add option of non-blocking to CUDA stream

											
										
										
											2019-11-19 17:54:34 +03:00
+								        return CudaStream(non_blocking=non_blocking)
-												Improve integration of NN parser, to support unified training API

											
										
										
											2017-05-15 22:46:08 +03:00
 								def get_async(stream, numpy_array):
 								    if cupy is None:
 								        return numpy_array
 								    else:
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        array = cupy.ndarray(numpy_array.shape, order="C", dtype=numpy_array.dtype)
-												Add util.env_opt support: Can set hyper params through environment variables.

											
										
										
											2017-05-18 12:36:53 +03:00
+								        array.set(numpy_array, stream=stream)
 								        return array
-												Fix formatting

											
										
										
											2017-05-26 13:37:45 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def read_regex(path: Union[str, Path]) -> Pattern:
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 13:11:16 +03:00
+								    path = ensure_path(path)
-												set encodings explicitly to utf8 (#4551)


											
										
										
											2019-10-29 15:16:55 +03:00
+								    with path.open(encoding="utf8") as file_:
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								        entries = file_.read().split("\n")
 								    expression = "|".join(
 								        ["^" + re.escape(piece) for piece in entries if piece.strip()]
 								    )
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
+								    return re.compile(expression)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def compile_prefix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
-												Update docstrings [ci skip]

											
										
										
											2019-02-24 20:39:59 +03:00
+								    """Compile a sequence of prefix rules into a regex object.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    entries (Iterable[Union[str, Pattern]]): The prefix rules, e.g.
 								        spacy.lang.punctuation.TOKENIZER_PREFIXES.
 								    RETURNS (Pattern): The regex object, to be used for Tokenizer.prefix_search.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
+								    """
-												Remove dead and/or deprecated code (#5710)

* Remove dead and/or deprecated code

* Remove n_threads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-06 14:06:25 +03:00
+								    expression = "|".join(["^" + piece for piece in entries if piece.strip()])
 								    return re.compile(expression)
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def compile_suffix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
-												Update docstrings [ci skip]

											
										
										
											2019-02-24 20:39:59 +03:00
+								    """Compile a sequence of suffix rules into a regex object.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    entries (Iterable[Union[str, Pattern]]): The suffix rules, e.g.
 								        spacy.lang.punctuation.TOKENIZER_SUFFIXES.
 								    RETURNS (Pattern): The regex object, to be used for Tokenizer.suffix_search.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
+								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    expression = "|".join([piece + "$" for piece in entries if piece.strip()])
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
+								    return re.compile(expression)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def compile_infix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
-												Update docstrings [ci skip]

											
										
										
											2019-02-24 20:39:59 +03:00
+								    """Compile a sequence of infix rules into a regex object.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    entries (Iterable[Union[str, Pattern]]): The infix rules, e.g.
 								        spacy.lang.punctuation.TOKENIZER_INFIXES.
-												Document regex utilities [ci skip]

											
										
										
											2019-02-24 20:34:10 +03:00
+								    RETURNS (Pattern): The regex object, to be used for Tokenizer.infix_finditer.
 								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    expression = "|".join([piece for piece in entries if piece.strip()])
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
+								    return re.compile(expression)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def add_lookups(default_func: Callable[[str], Any], *lookups) -> Callable[[str], Any]:
-												Add add_lookups util function

											
										
										
											2017-06-03 20:44:47 +03:00
+								    """Extend an attribute function with special cases. If a word is in the
 								    lookups, the value is returned. Otherwise the previous function is used.
 								    default_func (callable): The default function to execute.
 								    *lookups (dict): Lookup dictionary mapping string to attribute value.
 								    RETURNS (callable): Lexical attribute getter.
 								    """
-												Convert closure to functools.partial, to promote pickling

											
										
										
											2017-10-17 19:20:52 +03:00
+								    # This is implemented as functools.partial instead of a closure, to allow
 								    # pickle to work.
 								    return functools.partial(_get_attr_unless_lookup, default_func, lookups)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def _get_attr_unless_lookup(
 								    default_func: Callable[[str], Any], lookups: Dict[str, Any], string: str
 								) -> Any:
-												Convert closure to functools.partial, to promote pickling

											
										
										
											2017-10-17 19:20:52 +03:00
+								    for lookup in lookups:
 								        if string in lookup:
 								            return lookup[string]
 								    return default_func(string)
-												Add add_lookups util function

											
										
										
											2017-06-03 20:44:47 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def update_exc(
 								    base_exceptions: Dict[str, List[dict]], *addition_dicts
 								) -> Dict[str, List[dict]]:
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Update and validate tokenizer exceptions. Will overwrite exceptions.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    base_exceptions (Dict[str, List[dict]]): Base exceptions.
 								    *addition_dicts (Dict[str, List[dict]]): Exceptions to add to the base dict, in order.
 								    RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    exc = dict(base_exceptions)
 								    for additions in addition_dicts:
 								        for orth, token_attrs in additions.items():
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								            if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								            described_orth = "".join(attr[ORTH] for attr in token_attrs)
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								            if orth != described_orth:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E056.format(key=orth, orths=described_orth))
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								        exc.update(additions)
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 22:22:25 +03:00
+								    exc = expand_exc(exc, "'", "’")
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    return exc
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def expand_exc(
 								    excs: Dict[str, List[dict]], search: str, replace: str
 								) -> Dict[str, List[dict]]:
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Find string in tokenizer exceptions, duplicate entry and replace string.
 								    For example, to add additional versions with typographic apostrophes.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    excs (Dict[str, List[dict]]): Tokenizer exceptions.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    search (str): String to find and replace.
 								    replace (str): Replacement.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    def _fix_token(token, search, replace):
 								        fixed = dict(token)
 								        fixed[ORTH] = fixed[ORTH].replace(search, replace)
 								        return fixed
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 22:22:25 +03:00
+								    new_excs = dict(excs)
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    for token_string, tokens in excs.items():
 								        if search in token_string:
 								            new_key = token_string.replace(search, replace)
 								            new_value = [_fix_token(t, search, replace) for t in tokens]
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 22:22:25 +03:00
+								            new_excs[new_key] = new_value
 								    return new_excs
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def normalize_slice(
 								    length: int, start: int, stop: int, step: Optional[int] = None
 								) -> Tuple[int, int]:
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    if not (step is None or step == 1):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								        raise ValueError(Errors.E057)
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    if start is None:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        start = 0
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    elif start < 0:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        start += length
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    start = min(length, max(0, start))
 								    if stop is None:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        stop = length
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    elif stop < 0:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        stop += length
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								    stop = min(length, max(start, stop))
 								    return start, stop
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
-												Add util.filter_spans helper (#3686)


											
										
										
											2019-05-08 03:33:40 +03:00
+								    """Filter a sequence of spans and remove duplicates or overlaps. Useful for
 								    creating named entities (where one token can only be part of one entity) or
 								    when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
 								    longest span is preferred over shorter spans.
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    spans (Iterable[Span]): The spans to filter.
 								    RETURNS (List[Span]): The filtered spans.
-												Add util.filter_spans helper (#3686)


											
										
										
											2019-05-08 03:33:40 +03:00
+								    """
-												Fix util.filter_spans() to prefer first span in overlapping sam… (#4414)

* Update util.filter_spans() to prefer earlier spans

* Add filter_spans test for first same-length span

* Update entity relation example to refer to util.filter_spans()

											
										
										
											2019-10-10 18:00:03 +03:00
+								    get_sort_key = lambda span: (span.end - span.start, -span.start)
-												Add util.filter_spans helper (#3686)


											
										
										
											2019-05-08 03:33:40 +03:00
+								    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
 								    result = []
 								    seen_tokens = set()
 								    for span in sorted_spans:
 								        # Check for end - 1 here because boundaries are inclusive
 								        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
 								            result.append(span)
 								        seen_tokens.update(range(span.start, span.end))
 								    result = sorted(result, key=lambda span: span.start)
 								    return result
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def to_bytes(getters: Dict[str, Callable[[], bytes]], exclude: Iterable[str]) -> bytes:
-												Improve spacy.gold (no GoldParse, no json format!) (#5555)

* Update errors

* Remove beam for now (maybe)

Remove beam_utils

Update setup.py

Remove beam

* Remove GoldParse

WIP on removing goldparse

Get ArcEager compiling after GoldParse excise

Update setup.py

Get spacy.syntax compiling after removing GoldParse

Rename NewExample -> Example and clean up

Clean html files

Start updating tests

Update Morphologizer

* fix error numbers

* fix merge conflict

* informative error when calling to_array with wrong field

* fix error catching

* fixing language and scoring tests

* start testing get_aligned

* additional tests for new get_aligned function

* Draft create_gold_state for arc_eager oracle

* Fix import

* Fix import

* Remove TokenAnnotation code from nonproj

* fixing NER one-to-many alignment

* Fix many-to-one IOB codes

* fix test for misaligned

* attempt to fix cases with weird spaces

* fix spaces

* test_gold_biluo_different_tokenization works

* allow None as BILUO annotation

* fixed some tests + WIP roundtrip unit test

* add spaces to json output format

* minibatch utiltiy can deal with strings, docs or examples

* fix augment (needs further testing)

* various fixes in scripts - needs to be further tested

* fix test_cli

* cleanup

* correct silly typo

* add support for MORPH in to/from_array, fix morphologizer overfitting test

* fix tagger

* fix entity linker

* ensure test keeps working with non-linked entities

* pipe() takes docs, not examples

* small bug fix

* textcat bugfix

* throw informative error when running the components with the wrong type of objects

* fix parser tests to work with example (most still failing)

* fix BiluoPushDown parsing entities

* small fixes

* bugfix tok2vec

* fix renames and simple_ner labels

* various small fixes

* prevent writing dummy values like deps because that could interfer with sent_start values

* fix the fix

* implement split_sent with aligned SENT_START attribute

* test for split sentences with various alignment issues, works

* Return ArcEagerGoldParse from ArcEager

* Update parser and NER gold stuff

* Draft new GoldCorpus class

* add links to to_dict

* clean up

* fix test checking for variants

* Fix oracles

* Start updating converters

* Move converters under spacy.gold

* Move things around

* Fix naming

* Fix name

* Update converter to produce DocBin

* Update converters

* Allow DocBin to take list of Doc objects.

* Make spacy convert output docbin

* Fix import

* Fix docbin

* Fix compile in ArcEager

* Fix import

* Serialize all attrs by default

* Update converter

* Remove jsonl converter

* Add json2docs converter

* Draft Corpus class for DocBin

* Work on train script

* Update Corpus

* Update DocBin

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Update train.py

* Fix Corpus

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

* Skip tests that cause crashes

* Skip test causing segfault

* Remove GoldCorpus

* Update imports

* Update after removing GoldCorpus

* Fix module name of corpus

* Fix mimport

* Work on parser oracle

* Update arc_eager oracle

* Restore ArcEager.get_cost function

* Update transition system

* Update test_arc_eager_oracle

* Remove beam test

* Update test

* Unskip

* Unskip tests

* add links to to_dict

* clean up

* fix test checking for variants

* Allow DocBin to take list of Doc objects.

* Fix compile in ArcEager

* Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Start updating converters

* Work on train script

* Draft Corpus class for DocBin

Update Corpus

Fix Corpus

* Update DocBin

Add missing strings when serializing

* Update train.py

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

Skip tests that cause crashes

Skip test causing segfault

* Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix mimport

* Work on parser oracle

Update arc_eager oracle

Restore ArcEager.get_cost function

Update transition system

* Update tests

Remove beam test

Update test

Unskip

Unskip tests

* Add get_aligned_parse method in Example

Fix Example.get_aligned_parse

* Add kwargs to Corpus.dev_dataset to match train_dataset

* Update nonproj

* Use get_aligned_parse in ArcEager

* Add another arc-eager oracle test

* Remove Example.doc property

Remove Example.doc

Remove Example.doc

Remove Example.doc

Remove Example.doc

* Update ArcEager oracle

Fix Break oracle

* Debugging

* Fix Corpus

* Fix eg.doc

* Format

* small fixes

* limit arg for Corpus

* fix test_roundtrip_docs_to_docbin

* fix test_make_orth_variants

* fix add_label test

* Update tests

* avoid writing temp dir in json2docs, fixing 4402 test

* Update test

* Add missing costs to NER oracle

* Update test

* Work on Example.get_aligned_ner method

* Clean up debugging

* Xfail tests

* Remove prints

* Remove print

* Xfail some tests

* Replace unseen labels for parser

* Update test

* Update test

* Xfail test

* Fix Corpus

* fix imports

* fix docs_to_json

* various small fixes

* cleanup

* Support gold_preproc in Corpus

* Support gold_preproc

* Pass gold_preproc setting into corpus

* Remove debugging

* Fix gold_preproc

* Fix json2docs converter

* Fix convert command

* Fix flake8

* Fix import

* fix output_dir (converted to Path by typer)

* fix var

* bugfix: update states after creating golds to avoid out of bounds indexing

* Improve efficiency of ArEager oracle

* pull merge_sent into iob2docs to avoid Doc creation for each line

* fix asserts

* bugfix excl Span.end in iob2docs

* Support max_length in Corpus

* Fix arc_eager oracle

* Filter out uannotated sentences in NER

* Remove debugging in parser

* Simplify NER alignment

* Fix conversion of NER data

* Fix NER init_gold_batch

* Tweak efficiency of precomputable affine

* Update onto-json default

* Update gold test for NER

* Fix parser test

* Update test

* Add NER data test

* Fix convert for single file

* Fix test

* Hack scorer to avoid evaluating non-nered data

* Fix handling of NER data in Example

* Output unlabelled spans from O biluo tags in iob_utils

* Fix unset variable

* Return kept examples from init_gold_batch

* Return examples from init_gold_batch

* Dont return Example from init_gold_batch

* Set spaces on gold doc after conversion

* Add test

* Fix spaces reading

* Improve NER alignment

* Improve handling of missing values in NER

* Restore the 'cutting' in parser training

* Add assertion

* Print epochs

* Restore random cuts in parser/ner training

* Implement Doc.copy

* Implement Example.copy

* Copy examples at the start of Language.update

* Don't unset example docs

* Tweak parser model slightly

* attempt to fix _guess_spaces

* _add_entities_to_doc first, so that links don't get overwritten

* fixing get_aligned_ner for one-to-many

* fix indexing into x_text

* small fix biluo_tags_from_offsets

* Add onto-ner config

* Simplify NER alignment

* Fix NER scoring for partially annotated documents

* fix indexing into x_text

* fix test_cli failing tests by ignoring spans in doc.ents with empty label

* Fix limit

* Improve NER alignment

* Fix count_train

* Remove print statement

* fix tests, we're not having nothing but None

* fix clumsy fingers

* Fix tests

* Fix doc.ents

* Remove empty docs in Corpus and improve limit

* Update config

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2020-06-26 20:34:12 +03:00
+								    return srsly.msgpack_dumps(to_dict(getters, exclude))
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def from_bytes(
 								    bytes_data: bytes,
 								    setters: Dict[str, Callable[[bytes], Any]],
 								    exclude: Iterable[str],
 								) -> None:
-												Improve spacy.gold (no GoldParse, no json format!) (#5555)

* Update errors

* Remove beam for now (maybe)

Remove beam_utils

Update setup.py

Remove beam

* Remove GoldParse

WIP on removing goldparse

Get ArcEager compiling after GoldParse excise

Update setup.py

Get spacy.syntax compiling after removing GoldParse

Rename NewExample -> Example and clean up

Clean html files

Start updating tests

Update Morphologizer

* fix error numbers

* fix merge conflict

* informative error when calling to_array with wrong field

* fix error catching

* fixing language and scoring tests

* start testing get_aligned

* additional tests for new get_aligned function

* Draft create_gold_state for arc_eager oracle

* Fix import

* Fix import

* Remove TokenAnnotation code from nonproj

* fixing NER one-to-many alignment

* Fix many-to-one IOB codes

* fix test for misaligned

* attempt to fix cases with weird spaces

* fix spaces

* test_gold_biluo_different_tokenization works

* allow None as BILUO annotation

* fixed some tests + WIP roundtrip unit test

* add spaces to json output format

* minibatch utiltiy can deal with strings, docs or examples

* fix augment (needs further testing)

* various fixes in scripts - needs to be further tested

* fix test_cli

* cleanup

* correct silly typo

* add support for MORPH in to/from_array, fix morphologizer overfitting test

* fix tagger

* fix entity linker

* ensure test keeps working with non-linked entities

* pipe() takes docs, not examples

* small bug fix

* textcat bugfix

* throw informative error when running the components with the wrong type of objects

* fix parser tests to work with example (most still failing)

* fix BiluoPushDown parsing entities

* small fixes

* bugfix tok2vec

* fix renames and simple_ner labels

* various small fixes

* prevent writing dummy values like deps because that could interfer with sent_start values

* fix the fix

* implement split_sent with aligned SENT_START attribute

* test for split sentences with various alignment issues, works

* Return ArcEagerGoldParse from ArcEager

* Update parser and NER gold stuff

* Draft new GoldCorpus class

* add links to to_dict

* clean up

* fix test checking for variants

* Fix oracles

* Start updating converters

* Move converters under spacy.gold

* Move things around

* Fix naming

* Fix name

* Update converter to produce DocBin

* Update converters

* Allow DocBin to take list of Doc objects.

* Make spacy convert output docbin

* Fix import

* Fix docbin

* Fix compile in ArcEager

* Fix import

* Serialize all attrs by default

* Update converter

* Remove jsonl converter

* Add json2docs converter

* Draft Corpus class for DocBin

* Work on train script

* Update Corpus

* Update DocBin

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Update train.py

* Fix Corpus

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

* Skip tests that cause crashes

* Skip test causing segfault

* Remove GoldCorpus

* Update imports

* Update after removing GoldCorpus

* Fix module name of corpus

* Fix mimport

* Work on parser oracle

* Update arc_eager oracle

* Restore ArcEager.get_cost function

* Update transition system

* Update test_arc_eager_oracle

* Remove beam test

* Update test

* Unskip

* Unskip tests

* add links to to_dict

* clean up

* fix test checking for variants

* Allow DocBin to take list of Doc objects.

* Fix compile in ArcEager

* Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Start updating converters

* Work on train script

* Draft Corpus class for DocBin

Update Corpus

Fix Corpus

* Update DocBin

Add missing strings when serializing

* Update train.py

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

Skip tests that cause crashes

Skip test causing segfault

* Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix mimport

* Work on parser oracle

Update arc_eager oracle

Restore ArcEager.get_cost function

Update transition system

* Update tests

Remove beam test

Update test

Unskip

Unskip tests

* Add get_aligned_parse method in Example

Fix Example.get_aligned_parse

* Add kwargs to Corpus.dev_dataset to match train_dataset

* Update nonproj

* Use get_aligned_parse in ArcEager

* Add another arc-eager oracle test

* Remove Example.doc property

Remove Example.doc

Remove Example.doc

Remove Example.doc

Remove Example.doc

* Update ArcEager oracle

Fix Break oracle

* Debugging

* Fix Corpus

* Fix eg.doc

* Format

* small fixes

* limit arg for Corpus

* fix test_roundtrip_docs_to_docbin

* fix test_make_orth_variants

* fix add_label test

* Update tests

* avoid writing temp dir in json2docs, fixing 4402 test

* Update test

* Add missing costs to NER oracle

* Update test

* Work on Example.get_aligned_ner method

* Clean up debugging

* Xfail tests

* Remove prints

* Remove print

* Xfail some tests

* Replace unseen labels for parser

* Update test

* Update test

* Xfail test

* Fix Corpus

* fix imports

* fix docs_to_json

* various small fixes

* cleanup

* Support gold_preproc in Corpus

* Support gold_preproc

* Pass gold_preproc setting into corpus

* Remove debugging

* Fix gold_preproc

* Fix json2docs converter

* Fix convert command

* Fix flake8

* Fix import

* fix output_dir (converted to Path by typer)

* fix var

* bugfix: update states after creating golds to avoid out of bounds indexing

* Improve efficiency of ArEager oracle

* pull merge_sent into iob2docs to avoid Doc creation for each line

* fix asserts

* bugfix excl Span.end in iob2docs

* Support max_length in Corpus

* Fix arc_eager oracle

* Filter out uannotated sentences in NER

* Remove debugging in parser

* Simplify NER alignment

* Fix conversion of NER data

* Fix NER init_gold_batch

* Tweak efficiency of precomputable affine

* Update onto-json default

* Update gold test for NER

* Fix parser test

* Update test

* Add NER data test

* Fix convert for single file

* Fix test

* Hack scorer to avoid evaluating non-nered data

* Fix handling of NER data in Example

* Output unlabelled spans from O biluo tags in iob_utils

* Fix unset variable

* Return kept examples from init_gold_batch

* Return examples from init_gold_batch

* Dont return Example from init_gold_batch

* Set spaces on gold doc after conversion

* Add test

* Fix spaces reading

* Improve NER alignment

* Improve handling of missing values in NER

* Restore the 'cutting' in parser training

* Add assertion

* Print epochs

* Restore random cuts in parser/ner training

* Implement Doc.copy

* Implement Example.copy

* Copy examples at the start of Language.update

* Don't unset example docs

* Tweak parser model slightly

* attempt to fix _guess_spaces

* _add_entities_to_doc first, so that links don't get overwritten

* fixing get_aligned_ner for one-to-many

* fix indexing into x_text

* small fix biluo_tags_from_offsets

* Add onto-ner config

* Simplify NER alignment

* Fix NER scoring for partially annotated documents

* fix indexing into x_text

* fix test_cli failing tests by ignoring spans in doc.ents with empty label

* Fix limit

* Improve NER alignment

* Fix count_train

* Remove print statement

* fix tests, we're not having nothing but None

* fix clumsy fingers

* Fix tests

* Fix doc.ents

* Remove empty docs in Corpus and improve limit

* Update config

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2020-06-26 20:34:12 +03:00
+								    return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude)
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def to_dict(
 								    getters: Dict[str, Callable[[], Any]], exclude: Iterable[str]
 								) -> Dict[str, Any]:
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    serialized = {}
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								    for key, getter in getters.items():
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        # Split to support file names like meta.json
 								        if key.split(".")[0] not in exclude:
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								            serialized[key] = getter()
-												Improve spacy.gold (no GoldParse, no json format!) (#5555)

* Update errors

* Remove beam for now (maybe)

Remove beam_utils

Update setup.py

Remove beam

* Remove GoldParse

WIP on removing goldparse

Get ArcEager compiling after GoldParse excise

Update setup.py

Get spacy.syntax compiling after removing GoldParse

Rename NewExample -> Example and clean up

Clean html files

Start updating tests

Update Morphologizer

* fix error numbers

* fix merge conflict

* informative error when calling to_array with wrong field

* fix error catching

* fixing language and scoring tests

* start testing get_aligned

* additional tests for new get_aligned function

* Draft create_gold_state for arc_eager oracle

* Fix import

* Fix import

* Remove TokenAnnotation code from nonproj

* fixing NER one-to-many alignment

* Fix many-to-one IOB codes

* fix test for misaligned

* attempt to fix cases with weird spaces

* fix spaces

* test_gold_biluo_different_tokenization works

* allow None as BILUO annotation

* fixed some tests + WIP roundtrip unit test

* add spaces to json output format

* minibatch utiltiy can deal with strings, docs or examples

* fix augment (needs further testing)

* various fixes in scripts - needs to be further tested

* fix test_cli

* cleanup

* correct silly typo

* add support for MORPH in to/from_array, fix morphologizer overfitting test

* fix tagger

* fix entity linker

* ensure test keeps working with non-linked entities

* pipe() takes docs, not examples

* small bug fix

* textcat bugfix

* throw informative error when running the components with the wrong type of objects

* fix parser tests to work with example (most still failing)

* fix BiluoPushDown parsing entities

* small fixes

* bugfix tok2vec

* fix renames and simple_ner labels

* various small fixes

* prevent writing dummy values like deps because that could interfer with sent_start values

* fix the fix

* implement split_sent with aligned SENT_START attribute

* test for split sentences with various alignment issues, works

* Return ArcEagerGoldParse from ArcEager

* Update parser and NER gold stuff

* Draft new GoldCorpus class

* add links to to_dict

* clean up

* fix test checking for variants

* Fix oracles

* Start updating converters

* Move converters under spacy.gold

* Move things around

* Fix naming

* Fix name

* Update converter to produce DocBin

* Update converters

* Allow DocBin to take list of Doc objects.

* Make spacy convert output docbin

* Fix import

* Fix docbin

* Fix compile in ArcEager

* Fix import

* Serialize all attrs by default

* Update converter

* Remove jsonl converter

* Add json2docs converter

* Draft Corpus class for DocBin

* Work on train script

* Update Corpus

* Update DocBin

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Update train.py

* Fix Corpus

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

* Skip tests that cause crashes

* Skip test causing segfault

* Remove GoldCorpus

* Update imports

* Update after removing GoldCorpus

* Fix module name of corpus

* Fix mimport

* Work on parser oracle

* Update arc_eager oracle

* Restore ArcEager.get_cost function

* Update transition system

* Update test_arc_eager_oracle

* Remove beam test

* Update test

* Unskip

* Unskip tests

* add links to to_dict

* clean up

* fix test checking for variants

* Allow DocBin to take list of Doc objects.

* Fix compile in ArcEager

* Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Start updating converters

* Work on train script

* Draft Corpus class for DocBin

Update Corpus

Fix Corpus

* Update DocBin

Add missing strings when serializing

* Update train.py

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

Skip tests that cause crashes

Skip test causing segfault

* Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix mimport

* Work on parser oracle

Update arc_eager oracle

Restore ArcEager.get_cost function

Update transition system

* Update tests

Remove beam test

Update test

Unskip

Unskip tests

* Add get_aligned_parse method in Example

Fix Example.get_aligned_parse

* Add kwargs to Corpus.dev_dataset to match train_dataset

* Update nonproj

* Use get_aligned_parse in ArcEager

* Add another arc-eager oracle test

* Remove Example.doc property

Remove Example.doc

Remove Example.doc

Remove Example.doc

Remove Example.doc

* Update ArcEager oracle

Fix Break oracle

* Debugging

* Fix Corpus

* Fix eg.doc

* Format

* small fixes

* limit arg for Corpus

* fix test_roundtrip_docs_to_docbin

* fix test_make_orth_variants

* fix add_label test

* Update tests

* avoid writing temp dir in json2docs, fixing 4402 test

* Update test

* Add missing costs to NER oracle

* Update test

* Work on Example.get_aligned_ner method

* Clean up debugging

* Xfail tests

* Remove prints

* Remove print

* Xfail some tests

* Replace unseen labels for parser

* Update test

* Update test

* Xfail test

* Fix Corpus

* fix imports

* fix docs_to_json

* various small fixes

* cleanup

* Support gold_preproc in Corpus

* Support gold_preproc

* Pass gold_preproc setting into corpus

* Remove debugging

* Fix gold_preproc

* Fix json2docs converter

* Fix convert command

* Fix flake8

* Fix import

* fix output_dir (converted to Path by typer)

* fix var

* bugfix: update states after creating golds to avoid out of bounds indexing

* Improve efficiency of ArEager oracle

* pull merge_sent into iob2docs to avoid Doc creation for each line

* fix asserts

* bugfix excl Span.end in iob2docs

* Support max_length in Corpus

* Fix arc_eager oracle

* Filter out uannotated sentences in NER

* Remove debugging in parser

* Simplify NER alignment

* Fix conversion of NER data

* Fix NER init_gold_batch

* Tweak efficiency of precomputable affine

* Update onto-json default

* Update gold test for NER

* Fix parser test

* Update test

* Add NER data test

* Fix convert for single file

* Fix test

* Hack scorer to avoid evaluating non-nered data

* Fix handling of NER data in Example

* Output unlabelled spans from O biluo tags in iob_utils

* Fix unset variable

* Return kept examples from init_gold_batch

* Return examples from init_gold_batch

* Dont return Example from init_gold_batch

* Set spaces on gold doc after conversion

* Add test

* Fix spaces reading

* Improve NER alignment

* Improve handling of missing values in NER

* Restore the 'cutting' in parser training

* Add assertion

* Print epochs

* Restore random cuts in parser/ner training

* Implement Doc.copy

* Implement Example.copy

* Copy examples at the start of Language.update

* Don't unset example docs

* Tweak parser model slightly

* attempt to fix _guess_spaces

* _add_entities_to_doc first, so that links don't get overwritten

* fixing get_aligned_ner for one-to-many

* fix indexing into x_text

* small fix biluo_tags_from_offsets

* Add onto-ner config

* Simplify NER alignment

* Fix NER scoring for partially annotated documents

* fix indexing into x_text

* fix test_cli failing tests by ignoring spans in doc.ents with empty label

* Fix limit

* Improve NER alignment

* Fix count_train

* Remove print statement

* fix tests, we're not having nothing but None

* fix clumsy fingers

* Fix tests

* Fix doc.ents

* Remove empty docs in Corpus and improve limit

* Update config

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2020-06-26 20:34:12 +03:00
+								    return serialized
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def from_dict(
 								    msg: Dict[str, Any],
 								    setters: Dict[str, Callable[[Any], Any]],
 								    exclude: Iterable[str],
 								) -> Dict[str, Any]:
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								    for key, setter in setters.items():
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        # Split to support file names like meta.json
 								        if key.split(".")[0] not in exclude and key in msg:
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								            setter(msg[key])
 								    return msg
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def to_disk(
 								    path: Union[str, Path],
 								    writers: Dict[str, Callable[[Path], None]],
 								    exclude: Iterable[str],
 								) -> Path:
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								    path = ensure_path(path)
 								    if not path.exists():
 								        path.mkdir()
 								    for key, writer in writers.items():
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        # Split to support file names like meta.json
 								        if key.split(".")[0] not in exclude:
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								            writer(path / key)
 								    return path
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def from_disk(
 								    path: Union[str, Path],
 								    readers: Dict[str, Callable[[Path], None]],
 								    exclude: Iterable[str],
 								) -> Path:
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								    path = ensure_path(path)
 								    for key, reader in readers.items():
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        # Split to support file names like meta.json
 								        if key.split(".")[0] not in exclude:
-												Fix deserialization of vectors

											
										
										
											2017-10-16 21:55:00 +03:00
+								            reader(path / key)
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								    return path
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    """Import module from a file. Used to load models from a directory.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    name (str): Name of module to load.
 								    loc (str / Path): Path to the file.
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    RETURNS: The loaded module.
 								    """
 								    loc = str(loc)
 								    spec = importlib.util.spec_from_file_location(name, str(loc))
 								    module = importlib.util.module_from_spec(spec)
 								    spec.loader.exec_module(module)
 								    return module
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def minify_html(html: str) -> str:
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 18:50:23 +03:00
+								    """Perform a template-specific, rudimentary HTML minification for displaCy.
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								    Disclaimer: NOT a general-purpose solution, only removes indentation and
 								    newlines.
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 18:50:23 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    html (str): Markup to minify.
 								    RETURNS (str): "Minified" HTML.
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 18:50:23 +03:00
+								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    return html.strip().replace("    ", "").replace("\n", "")
-												Add util function to enable GPU

											
										
										
											2017-09-21 03:16:35 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def escape_html(text: str) -> str:
-												escape html in displacy.render (#2378) (closes #2361)

## Description
Fix for issue #2361 :
replace &, <, >, " with &amp; , &lt; , &gt; , &quot; before rendering svg

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
(As discussed in the comments to #2361)
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-05-28 19:36:41 +03:00
+								    """Replace <, >, &, " with their HTML encoded representation. Intended to
 								    prevent HTML errors in rendered displaCy markup.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								    text (str): The original text.
 								    RETURNS (str): Equivalent text to be safely used within HTML.
-												escape html in displacy.render (#2378) (closes #2361)

## Description
Fix for issue #2361 :
replace &, <, >, " with &amp; , &lt; , &gt; , &quot; before rendering svg

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
(As discussed in the comments to #2361)
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-05-28 19:36:41 +03:00
+								    """
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
+								    text = text.replace("&", "&amp;")
 								    text = text.replace("<", "&lt;")
 								    text = text.replace(">", "&gt;")
 								    text = text.replace('"', "&quot;")
-												escape html in displacy.render (#2378) (closes #2361)

## Description
Fix for issue #2361 :
replace &, <, >, " with &amp; , &lt; , &gt; , &quot; before rendering svg

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
(As discussed in the comments to #2361)
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-05-28 19:36:41 +03:00
+								    return text
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								def get_words_and_spaces(
 								    words: Iterable[str], text: str
 								) -> Tuple[List[str], List[bool]]:
-												Update docs, types and API consistency

											
										
										
											2020-08-17 17:45:24 +03:00
+								    """Given a list of words and a text, reconstruct the original tokens and
 								    return a list of words and spaces that can be used to create a Doc. This
 								    can help recover destructive tokenization that didn't preserve any
 								    whitespace information.
 								    words (Iterable[str]): The words.
 								    text (str): The original text.
 								    RETURNS (Tuple[List[str], List[bool]]): The words and spaces.
 								    """
-												Improve GoldParse NER alignment (#5335)

Improve GoldParse NER alignment by including all cases where the start
and end of the NER span can be aligned, regardless of internal
tokenization differences.

To do this, convert BILUO tags to character offsets, check start/end
alignment with `doc.char_span()`, and assign the BILUO tags for the
aligned spans. Alignment for `O/-` tags is handled through the
one-to-one and multi alignments.
											
										
										
											2020-04-23 17:58:23 +03:00
+								    if "".join("".join(words).split()) != "".join(text.split()):
-												Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text

Add an option to initialize a `Doc` from a text and list of words where
the words may or may not include all whitespace tokens. If the text and
words are mismatched, raise an error.

* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
											
										
										
											2020-04-14 20:15:52 +03:00
+								        raise ValueError(Errors.E194.format(text=text, words=words))
 								    text_words = []
 								    text_spaces = []
 								    text_pos = 0
 								    # normalize words to remove all whitespace tokens
 								    norm_words = [word for word in words if not word.isspace()]
 								    # align words with text
 								    for word in norm_words:
 								        try:
 								            word_start = text[text_pos:].index(word)
 								        except ValueError:
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								            raise ValueError(Errors.E194.format(text=text, words=words)) from None
-												Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text

Add an option to initialize a `Doc` from a text and list of words where
the words may or may not include all whitespace tokens. If the text and
words are mismatched, raise an error.

* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
											
										
										
											2020-04-14 20:15:52 +03:00
+								        if word_start > 0:
-												Tidy up and auto-format

											
										
										
											2020-05-21 15:14:01 +03:00
+								            text_words.append(text[text_pos : text_pos + word_start])
-												Add Doc init from list of words and text (#5251)

* Add Doc init from list of words and text

Add an option to initialize a `Doc` from a text and list of words where
the words may or may not include all whitespace tokens. If the text and
words are mismatched, raise an error.

* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
											
										
										
											2020-04-14 20:15:52 +03:00
+								            text_spaces.append(False)
 								            text_pos += word_start
 								        text_words.append(word)
 								        text_spaces.append(False)
 								        text_pos += len(word)
 								        if text_pos < len(text) and text[text_pos] == " ":
 								            text_spaces[-1] = True
 								            text_pos += 1
 								    if text_pos < len(text):
 								        text_words.append(text[text_pos:])
 								        text_spaces.append(False)
 								    return (text_words, text_spaces)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
 								    """Deep copy a Config. Will raise an error if the config contents are not
 								    JSON-serializable.
 								    config (Config): The config to copy.
 								    RETURNS (Config): The copied config.
-												Add SimpleFrozenDict util to use as default function argument

											
										
										
											2018-05-20 16:13:37 +03:00
+								    """
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    try:
 								        return Config(config).copy()
 								    except ValueError:
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								        raise ValueError(Errors.E961.format(config=config)) from None
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 19:03:03 +03:00
-												Add SimpleFrozenDict util to use as default function argument

											
										
										
											2018-05-20 16:13:37 +03:00
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
    """Expand dot-separated keys into nested dicts. For example:
    {"token.pos": True, "token._.xyz": True} is turned into
    {"token": {"pos": True, "_": {"xyz": True}}}. Keys are lowercased before
    they are split on ".".

    values (Dict[str, Any]): The key/value pairs to convert.
    RETURNS (Dict[str, dict]): The converted values.
    """
    nested: Dict[str, dict] = {}
    for dotted, leaf in values.items():
        # Every segment before the last names a nested dict; the last segment
        # is the slot that receives the value.
        *branches, final = dotted.lower().split(".")
        cursor = nested
        for name in branches:
            cursor = cursor.setdefault(name, {})
        # setdefault keeps whatever an earlier key already stored in the slot
        cursor.setdefault(final, leaf)
    return nested
 								def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
 								    """Flatten a nested dict into dot notation. For example: {"token":
 								    {"pos": True, "_": {"xyz": True }}} becomes {"token.pos": True,
 								    "token._.xyz": True}.
 								    obj (Dict[str, dict]): The dict to convert.
 								    RETURNS (Dict[str, Any]): The key/value pairs.
 								    """
 								    flat = {}
 								    # walk_dict yields (key path, leaf value); the path becomes the dotted key
 								    for key_path, leaf in walk_dict(obj):
 								        flat[".".join(key_path)] = leaf
 								    return flat
-												util function dot_to_object and corresponding unit test

											
										
										
											2020-07-27 18:50:12 +03:00
+								def dot_to_object(config: Config, section: str):
 								    """Convert dot notation of a "section" to a specific part of the Config.
 								    e.g. "training.optimizer" would return the Optimizer object.
 								    Throws an error if the section is not defined in this config.
 								    config (Config): The config.
 								    section (str): The dot notation of the section in the config.
 								    RETURNS: The object denoted by the section
 								    """
 								    component = config
 								    parts = section.split(".")
 								    for item in parts:
 								        try:
 								            component = component[item]
-												Move error to Errors

											
										
										
											2020-07-28 17:24:14 +03:00
+								        except (KeyError, TypeError):
-												Use "raise ... from" in custom errors for better tracebacks

											
										
										
											2020-08-06 00:53:21 +03:00
+								            raise KeyError(Errors.E952.format(name=section)) from None
-												util function dot_to_object and corresponding unit test

											
										
										
											2020-07-27 18:50:12 +03:00
+								    return component
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
def walk_dict(
    node: Dict[str, Any], parent: Optional[List[str]] = None
) -> Iterator[Tuple[List[str], Any]]:
    """Walk a dict and yield the path and values of the leaves. Any value
    that is not a dict counts as a leaf; an empty nested dict yields nothing.

    node (Dict[str, Any]): The (possibly nested) dict to walk.
    parent (Optional[List[str]]): Keys leading to node, or None at the top
        level. The list is never modified.
    YIELDS (Tuple[List[str], Any]): The key path and value of each leaf.
    """
    # None sentinel instead of a mutable [] default shared between calls
    prefix = [] if parent is None else parent
    for key, value in node.items():
        # Build a new list per entry so yielded paths are independent
        key_parent = [*prefix, key]
        if isinstance(value, dict):
            yield from walk_dict(value, key_parent)
        else:
            yield (key_parent, value)
 								def get_arg_names(func: Callable) -> List[str]:
 								    """Get a list of all named arguments of a function (regular,
 								    keyword-only).
 								    func (Callable): The function
 								    RETURNS (List[str]): The argument names.
 								    """
 								    argspec = inspect.getfullargspec(func)
 								    return list(set([*argspec.args, *argspec.kwonlyargs]))
-												Making `lang/th/test_tokenizer.py` pass by creating `ThaiTokenizer` (#3078)


											
										
										
											2019-01-10 17:40:37 +03:00
-												Fix combined scores and update test

											
										
										
											2020-09-24 11:42:47 +03:00
def combine_score_weights(
    weights: List[Dict[str, float]],
    overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(),
) -> Dict[str, float]:
    """Merge the score weights declared by several components into one
    normalized dict, e.g. from {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5}
    and {"some_other_score": 1.0}.

    weights (List[dict]): One dict of score weights per component.
    overrides (Dict[str, Optional[Union[float, int]]]): Values that replace a
        component's own weight for the same key. A value of None marks the
        score as neither displayed nor weighted.
    RETURNS (Dict[str, float]): The combined weights. Keys that only ever
        resolved to None are kept with the value None.
    """
    combined = {}
    kept_per_component = []
    # First pass: apply the overrides and split off None/null entries, which
    # are recorded in the output as-is and excluded from normalization.
    for component_weights in weights:
        kept = {}
        for name, default in component_weights.items():
            resolved = overrides.get(name, default)
            if resolved is not None:
                kept[name] = resolved
            else:
                combined[name] = None
        kept_per_component.append(kept)
    n_components = len(kept_per_component)
    # Second pass: a component's weights may not sum to 1.0, so scale each one
    # by the component total and then by the number of components.
    for kept in kept_per_component:
        total = sum(kept.values())
        for name, value in kept.items():
            # An all-zero component contributes nothing (and must not divide
            # by zero).
            share = 0.0 if total == 0 else round(value / total / n_components, 2)
            # Several components may report the same key: accumulate, treating
            # an earlier None placeholder as zero.
            previous = combined.get(name)
            combined[name] = (0.0 if previous is None else previous) + share
    return combined
-												Remove object subclassing

											
										
										
											2020-07-12 15:03:23 +03:00
class DummyTokenizer:
    """Placeholder class whose serialization methods do nothing."""

    # The no-op to_bytes, from_bytes, to_disk and from_disk methods below are
    # provided so that serialization can be called on this object (see #1557)
    def to_bytes(self, **kwargs):
        """Return an empty bytestring. Keyword arguments are ignored."""
        return bytes()

    def from_bytes(self, _bytes_data, **kwargs):
        """Ignore the given data and keyword arguments and return self."""
        return self

    def to_disk(self, _path, **kwargs):
        """Write nothing. The path and keyword arguments are ignored."""
        return None

    def from_disk(self, _path, **kwargs):
        """Read nothing. Ignore the path and keyword arguments, return self."""
        return self
-												Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitly to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not successful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults !

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematurely

* fix CharacterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0 !

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>

											
										
										
											2020-01-29 19:06:46 +03:00
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
def create_default_optimizer() -> Optimizer:
    """Create the default optimizer.

    RETURNS (Optimizer): A new `Adam` instance built with no arguments.
    """
    optimizer = Adam()
    return optimizer
-												Create corpus iterator and batcher from registry during training (#5865)

* Move batchers into their own module (and registry)

* Update CLI

* Update Corpus and batcher

* Update tests

* Update one config

* Merge 'evaluation' block back under [training]

* Import batchers in gold __init__

* Fix batchers

* Update config

* Update schema

* Update util

* Don't assume train and dev are actually paths

* Update onto-joint config

* Fix missing import

* Format

* Format

* Update spacy/gold/corpus.py

Co-authored-by: Ines Montani <ines@ines.io>

* Fix name

* Update default config

* Fix get_length option in batchers

* Update test

* Add comment

* Pass path into Corpus

* Update docstring

* Update schema and configs

* Update config

* Fix test

* Fix paths

* Fix print

* Fix create_train_batches

* [training.read_train] -> [training.train_corpus]

* Update onto-joint config

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-08-04 16:09:37 +03:00
 								def minibatch(items, size):
 								    """Iterate over batches of items. `size` may be an iterator,
 								    so that batch-size can vary on each step.
 								    """
 								    if isinstance(size, int):
 								        size_ = itertools.repeat(size)
 								    else:
 								        size_ = size
 								    items = iter(items)
 								    while True:
 								        batch_size = next(size_)
 								        batch = list(itertools.islice(items, int(batch_size)))
 								        if len(batch) == 0:
 								            break
 								        yield list(batch)