mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
commit
028f8210e8
|
@ -9,27 +9,28 @@ max_length = 5000
|
||||||
limit = 0
|
limit = 0
|
||||||
# Data augmentation
|
# Data augmentation
|
||||||
orth_variant_level = 0.0
|
orth_variant_level = 0.0
|
||||||
dropout = 0.2
|
dropout = 0.1
|
||||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||||
patience = 1600
|
patience = 100000
|
||||||
max_epochs = 0
|
max_epochs = 0
|
||||||
max_steps = 20000
|
max_steps = 0
|
||||||
eval_frequency = 500
|
eval_frequency = 1000
|
||||||
# Other settings
|
# Other settings
|
||||||
seed = 0
|
seed = 0
|
||||||
accumulate_gradient = 1
|
accumulate_gradient = 2
|
||||||
use_pytorch_for_gpu_memory = false
|
use_pytorch_for_gpu_memory = false
|
||||||
# Control how scores are printed and checkpoints are evaluated.
|
# Control how scores are printed and checkpoints are evaluated.
|
||||||
scores = ["speed", "ents_p", "ents_r", "ents_f"]
|
scores = ["speed", "ents_p", "ents_r", "ents_f"]
|
||||||
score_weights = {"ents_f": 1.0}
|
score_weights = {"ents_f": 1.0}
|
||||||
# These settings are invalid for the transformer models.
|
# These settings are invalid for the transformer models.
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
discard_oversize = false
|
discard_oversize = true
|
||||||
omit_extra_lookups = false
|
omit_extra_lookups = false
|
||||||
|
batch_by_words = true
|
||||||
|
|
||||||
[training.batch_size]
|
[training.batch_size]
|
||||||
@schedules = "compounding.v1"
|
@schedules = "compounding.v1"
|
||||||
start = 100
|
start = 1000
|
||||||
stop = 1000
|
stop = 1000
|
||||||
compound = 1.001
|
compound = 1.001
|
||||||
|
|
||||||
|
@ -37,18 +38,18 @@ compound = 1.001
|
||||||
@optimizers = "Adam.v1"
|
@optimizers = "Adam.v1"
|
||||||
beta1 = 0.9
|
beta1 = 0.9
|
||||||
beta2 = 0.999
|
beta2 = 0.999
|
||||||
L2_is_weight_decay = false
|
L2_is_weight_decay = true
|
||||||
L2 = 1e-6
|
L2 = 0.01
|
||||||
grad_clip = 1.0
|
grad_clip = 1.0
|
||||||
use_averages = true
|
use_averages = true
|
||||||
eps = 1e-8
|
eps = 1e-8
|
||||||
learn_rate = 0.001
|
learn_rate = 0.001
|
||||||
|
|
||||||
#[optimizer.learn_rate]
|
#[training.optimizer.learn_rate]
|
||||||
#@schedules = "warmup_linear.v1"
|
#@schedules = "warmup_linear.v1"
|
||||||
#warmup_steps = 250
|
#warmup_steps = 1000
|
||||||
#total_steps = 20000
|
#total_steps = 50000
|
||||||
#initial_rate = 0.001
|
#initial_rate = 0.003
|
||||||
|
|
||||||
[nlp]
|
[nlp]
|
||||||
lang = "en"
|
lang = "en"
|
||||||
|
@ -58,8 +59,6 @@ vectors = null
|
||||||
factory = "ner"
|
factory = "ner"
|
||||||
learn_tokens = false
|
learn_tokens = false
|
||||||
min_action_freq = 1
|
min_action_freq = 1
|
||||||
beam_width = 1
|
|
||||||
beam_update_prob = 1.0
|
|
||||||
|
|
||||||
[nlp.pipeline.ner.model]
|
[nlp.pipeline.ner.model]
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
@ -75,6 +74,6 @@ width = 96
|
||||||
depth = 4
|
depth = 4
|
||||||
window_size = 1
|
window_size = 1
|
||||||
embed_size = 2000
|
embed_size = 2000
|
||||||
maxout_pieces = 3
|
maxout_pieces = 1
|
||||||
subword_features = true
|
subword_features = true
|
||||||
dropout = ${training:dropout}
|
dropout = ${training:dropout}
|
||||||
|
|
|
@ -7,6 +7,7 @@ requires = [
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0a12,<8.0.0a20",
|
"thinc>=8.0.0a12,<8.0.0a20",
|
||||||
"blis>=0.4.0,<0.5.0"
|
"blis>=0.4.0,<0.5.0",
|
||||||
|
"pytokenizations"
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
|
@ -14,6 +14,7 @@ numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.3.0,<2.0.0
|
||||||
|
pytokenizations
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging
|
packaging
|
||||||
|
|
|
@ -51,6 +51,7 @@ install_requires =
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.3.0,<2.0.0
|
||||||
|
pytokenizations
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging
|
packaging
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -1,11 +1,11 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
from setuptools import Extension, setup, find_packages
|
||||||
import sys
|
import sys
|
||||||
import platform
|
import platform
|
||||||
from distutils.command.build_ext import build_ext
|
from distutils.command.build_ext import build_ext
|
||||||
from distutils.sysconfig import get_python_inc
|
from distutils.sysconfig import get_python_inc
|
||||||
import distutils.util
|
import distutils.util
|
||||||
from distutils import ccompiler, msvccompiler
|
from distutils import ccompiler, msvccompiler
|
||||||
from setuptools import Extension, setup, find_packages
|
|
||||||
import numpy
|
import numpy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -23,7 +23,6 @@ Options.docstrings = True
|
||||||
|
|
||||||
PACKAGES = find_packages()
|
PACKAGES = find_packages()
|
||||||
MOD_NAMES = [
|
MOD_NAMES = [
|
||||||
"spacy.gold.align",
|
|
||||||
"spacy.gold.example",
|
"spacy.gold.example",
|
||||||
"spacy.parts_of_speech",
|
"spacy.parts_of_speech",
|
||||||
"spacy.strings",
|
"spacy.strings",
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a1"
|
__version__ = "3.0.0a2"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
|
||||||
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
||||||
|
|
|
@ -15,8 +15,10 @@ from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_model import init_model # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project import project_clone, project_assets, project_run # noqa: F401
|
from .project.clone import project_clone # noqa: F401
|
||||||
from .project import project_run_all # noqa: F401
|
from .project.assets import project_assets # noqa: F401
|
||||||
|
from .project.run import project_run # noqa: F401
|
||||||
|
from .project.dvc import project_update_dvc # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
|
|
|
@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli
|
DOCS: https://spacy.io/api/cli
|
||||||
"""
|
"""
|
||||||
|
PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
|
||||||
|
project templates. You'd typically start by cloning a project template to a local
|
||||||
|
directory and fetching its assets like datasets etc. See the project's
|
||||||
|
project.yml for the available commands.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
app = typer.Typer(name=NAME, help=HELP)
|
app = typer.Typer(name=NAME, help=HELP)
|
||||||
|
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||||
|
app.add_typer(project_cli)
|
||||||
|
|
||||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||||
# keep the names short, but not needed at the moment.
|
# keep the names short, but not needed at the moment.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Sequence, Union
|
from typing import Optional, Sequence
|
||||||
import requests
|
import requests
|
||||||
import sys
|
import sys
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import is_package, get_base_version, run_command
|
from ..util import is_package, get_base_version, run_command
|
||||||
|
|
||||||
|
# These are the old shortcuts we previously supported in spacy download. As of
|
||||||
|
# v3, shortcuts are deprecated so we're not expecting to add anything to this
|
||||||
|
# list. It only exists to show users warnings.
|
||||||
|
OLD_SHORTCUTS = {
|
||||||
|
"en": "en_core_web_sm",
|
||||||
|
"de": "de_core_news_sm",
|
||||||
|
"es": "es_core_news_sm",
|
||||||
|
"pt": "pt_core_news_sm",
|
||||||
|
"fr": "fr_core_news_sm",
|
||||||
|
"it": "it_core_news_sm",
|
||||||
|
"nl": "nl_core_news_sm",
|
||||||
|
"el": "el_core_news_sm",
|
||||||
|
"nb": "nb_core_news_sm",
|
||||||
|
"lt": "lt_core_news_sm",
|
||||||
|
"xx": "xx_ent_wiki_sm",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
"download",
|
"download",
|
||||||
|
@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
version = components[-1]
|
version = components[-1]
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
else:
|
else:
|
||||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
model_name = model
|
||||||
model_name = shortcuts.get(model, model)
|
if model in OLD_SHORTCUTS:
|
||||||
|
msg.warn(
|
||||||
|
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
|
||||||
|
f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
|
||||||
|
)
|
||||||
|
model_name = OLD_SHORTCUTS[model]
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
|
@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_json(url: str, desc: str) -> Union[dict, list]:
|
def get_compatibility() -> dict:
|
||||||
r = requests.get(url)
|
version = get_base_version(about.__version__)
|
||||||
|
r = requests.get(about.__compatibility__)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"Server error ({r.status_code})",
|
f"Server error ({r.status_code})",
|
||||||
f"Couldn't fetch {desc}. Please find a model for your spaCy "
|
f"Couldn't fetch compatibility table. Please find a model for your spaCy "
|
||||||
f"installation (v{about.__version__}), and download it manually. "
|
f"installation (v{about.__version__}), and download it manually. "
|
||||||
f"For more details, see the documentation: "
|
f"For more details, see the documentation: "
|
||||||
f"https://spacy.io/usage/models",
|
f"https://spacy.io/usage/models",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
return r.json()
|
comp_table = r.json()
|
||||||
|
|
||||||
|
|
||||||
def get_compatibility() -> dict:
|
|
||||||
version = get_base_version(about.__version__)
|
|
||||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
|
||||||
comp = comp_table["spacy"]
|
comp = comp_table["spacy"]
|
||||||
if version not in comp:
|
if version not in comp:
|
||||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
||||||
|
|
|
@ -1,708 +0,0 @@
|
||||||
from typing import List, Dict, Any, Optional, Sequence
|
|
||||||
import typer
|
|
||||||
import srsly
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import subprocess
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
import sys
|
|
||||||
import requests
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
from ._app import app, Arg, Opt, COMMAND, NAME
|
|
||||||
from .. import about
|
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
|
||||||
from ..util import ensure_path, run_command, make_tempdir, working_dir
|
|
||||||
from ..util import get_hash, get_checksum, split_command
|
|
||||||
|
|
||||||
|
|
||||||
CONFIG_FILE = "project.yml"
|
|
||||||
DVC_CONFIG = "dvc.yaml"
|
|
||||||
DVC_DIR = ".dvc"
|
|
||||||
DIRS = [
|
|
||||||
"assets",
|
|
||||||
"metas",
|
|
||||||
"configs",
|
|
||||||
"packages",
|
|
||||||
"metrics",
|
|
||||||
"scripts",
|
|
||||||
"notebooks",
|
|
||||||
"training",
|
|
||||||
"corpus",
|
|
||||||
]
|
|
||||||
CACHES = [
|
|
||||||
Path.home() / ".torch",
|
|
||||||
Path.home() / ".caches" / "torch",
|
|
||||||
os.environ.get("TORCH_HOME"),
|
|
||||||
Path.home() / ".keras",
|
|
||||||
]
|
|
||||||
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
|
|
||||||
# it directly and edit the project.yml instead and re-run the project."""
|
|
||||||
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
|
|
||||||
templates. You'd typically start by cloning a project template to a local
|
|
||||||
directory and fetching its assets like datasets etc. See the project's
|
|
||||||
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
|
|
||||||
Version Control) to manage input and output files and to ensure steps are only
|
|
||||||
re-run if their inputs change.
|
|
||||||
"""
|
|
||||||
|
|
||||||
project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.callback(invoke_without_command=True)
|
|
||||||
def callback(ctx: typer.Context):
|
|
||||||
"""This runs before every project command and ensures DVC is installed."""
|
|
||||||
ensure_dvc()
|
|
||||||
|
|
||||||
|
|
||||||
################
|
|
||||||
# CLI COMMANDS #
|
|
||||||
################
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("clone")
|
|
||||||
def project_clone_cli(
|
|
||||||
# fmt: off
|
|
||||||
name: str = Arg(..., help="The name of the template to fetch"),
|
|
||||||
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
|
||||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
|
||||||
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Clone a project template from a repository. Calls into "git" and will
|
|
||||||
only download the files from the given subdirectory. The GitHub repo
|
|
||||||
defaults to the official spaCy template repo, but can be customized
|
|
||||||
(including using a private repo). Setting the --git flag will also
|
|
||||||
initialize the project directory as a Git repo. If the project is intended
|
|
||||||
to be a Git repo, it should be initialized with Git first, before
|
|
||||||
initializing DVC (Data Version Control). This allows DVC to integrate with
|
|
||||||
Git.
|
|
||||||
"""
|
|
||||||
if dest == Path.cwd():
|
|
||||||
dest = dest / name
|
|
||||||
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("init")
|
|
||||||
def project_init_cli(
|
|
||||||
# fmt: off
|
|
||||||
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Initialize a project directory with DVC and optionally Git. This should
|
|
||||||
typically be taken care of automatically when you run the "project clone"
|
|
||||||
command, but you can also run it separately. If the project is intended to
|
|
||||||
be a Git repo, it should be initialized with Git first, before initializing
|
|
||||||
DVC. This allows DVC to integrate with Git.
|
|
||||||
"""
|
|
||||||
project_init(path, git=git, force=force, silent=True)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("assets")
|
|
||||||
def project_assets_cli(
|
|
||||||
# fmt: off
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Use DVC (Data Version Control) to fetch project assets. Assets are
|
|
||||||
defined in the "assets" section of the project config. If possible, DVC
|
|
||||||
will try to track the files so you can pull changes from upstream. It will
|
|
||||||
also try and store the checksum so the assets are versioned. If the file
|
|
||||||
can't be tracked or checked, it will be downloaded without DVC. If a checksum
|
|
||||||
is provided in the project config, the file is only downloaded if no local
|
|
||||||
file with the same checksum exists.
|
|
||||||
"""
|
|
||||||
project_assets(project_dir)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run-all",
|
|
||||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
||||||
)
|
|
||||||
def project_run_all_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context,
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run all commands defined in the project. This command will use DVC and
|
|
||||||
the defined outputs and dependencies in the project config to determine
|
|
||||||
which steps need to be re-run and where to start. This means you're only
|
|
||||||
re-generating data if the inputs have changed.
|
|
||||||
|
|
||||||
This command calls into "dvc repro" and all additional arguments are passed
|
|
||||||
to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
|
|
||||||
"""
|
|
||||||
if show_help:
|
|
||||||
print_run_help(project_dir)
|
|
||||||
else:
|
|
||||||
project_run_all(project_dir, *ctx.args)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
||||||
)
|
|
||||||
def project_run_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context,
|
|
||||||
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run a named script defined in the project config. If the command is
|
|
||||||
part of the default pipeline defined in the "run" section, DVC is used to
|
|
||||||
determine whether the step should re-run if its inputs have changed, or
|
|
||||||
whether everything is up to date. If the script is not part of the default
|
|
||||||
pipeline, it will be called separately without DVC.
|
|
||||||
|
|
||||||
If DVC is used, the command calls into "dvc repro" and all additional
|
|
||||||
arguments are passed to the "dvc repro" command:
|
|
||||||
https://dvc.org/doc/command-reference/repro
|
|
||||||
"""
|
|
||||||
if show_help or not subcommand:
|
|
||||||
print_run_help(project_dir, subcommand)
|
|
||||||
else:
|
|
||||||
project_run(project_dir, subcommand, *ctx.args)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("exec", hidden=True)
|
|
||||||
def project_exec_cli(
|
|
||||||
# fmt: off
|
|
||||||
subcommand: str = Arg(..., help="Name of command defined in project config"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Execute a command defined in the project config. This CLI command is
|
|
||||||
only called internally in auto-generated DVC pipelines, as a shortcut for
|
|
||||||
multi-step commands in the project config. You typically shouldn't have to
|
|
||||||
call it yourself. To run a command, call "run" or "run-all".
|
|
||||||
"""
|
|
||||||
project_exec(project_dir, subcommand)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("update-dvc")
|
|
||||||
def project_update_dvc_cli(
|
|
||||||
# fmt: off
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Update the auto-generated DVC config file. Uses the steps defined in the
|
|
||||||
"run" section of the project config. This typically happens automatically
|
|
||||||
when running a command, but can also be triggered manually if needed.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
|
||||||
if updated:
|
|
||||||
msg.good(f"Updated DVC config from {CONFIG_FILE}")
|
|
||||||
else:
|
|
||||||
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
|
||||||
|
|
||||||
|
|
||||||
app.add_typer(project_cli, name="project")
|
|
||||||
|
|
||||||
|
|
||||||
#################
|
|
||||||
# CLI FUNCTIONS #
|
|
||||||
#################
|
|
||||||
|
|
||||||
|
|
||||||
def project_clone(
|
|
||||||
name: str,
|
|
||||||
dest: Path,
|
|
||||||
*,
|
|
||||||
repo: str = about.__projects__,
|
|
||||||
git: bool = False,
|
|
||||||
no_init: bool = False,
|
|
||||||
) -> None:
|
|
||||||
"""Clone a project template from a repository.
|
|
||||||
|
|
||||||
name (str): Name of subdirectory to clone.
|
|
||||||
dest (Path): Destination path of cloned project.
|
|
||||||
repo (str): URL of Git repo containing project templates.
|
|
||||||
git (bool): Initialize project as Git repo. Should be set to True if project
|
|
||||||
is intended as a repo, since it will allow DVC to integrate with Git.
|
|
||||||
no_init (bool): Don't initialize DVC and Git automatically. If True, the
|
|
||||||
"init" command or "git init" and "dvc init" need to be run manually.
|
|
||||||
"""
|
|
||||||
dest = ensure_path(dest)
|
|
||||||
check_clone(name, dest, repo)
|
|
||||||
project_dir = dest.resolve()
|
|
||||||
# We're using Git and sparse checkout to only clone the files we need
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
|
||||||
try:
|
|
||||||
run_command(cmd)
|
|
||||||
except SystemExit:
|
|
||||||
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
|
|
||||||
msg.fail(err)
|
|
||||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
|
||||||
f.write(name)
|
|
||||||
try:
|
|
||||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
|
||||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
|
||||||
except SystemExit:
|
|
||||||
err = f"Could not clone '{name}' in the repo '{repo}'."
|
|
||||||
msg.fail(err)
|
|
||||||
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
|
||||||
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
|
|
||||||
for sub_dir in DIRS:
|
|
||||||
dir_path = project_dir / sub_dir
|
|
||||||
if not dir_path.exists():
|
|
||||||
dir_path.mkdir(parents=True)
|
|
||||||
if not no_init:
|
|
||||||
project_init(project_dir, git=git, force=True, silent=True)
|
|
||||||
msg.good(f"Your project is now ready!", dest)
|
|
||||||
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_init(
|
|
||||||
project_dir: Path,
|
|
||||||
*,
|
|
||||||
git: bool = False,
|
|
||||||
force: bool = False,
|
|
||||||
silent: bool = False,
|
|
||||||
analytics: bool = False,
|
|
||||||
):
|
|
||||||
"""Initialize a project as a DVC and (optionally) as a Git repo.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
git (bool): Also call "git init" to initialize directory as a Git repo.
|
|
||||||
silent (bool): Don't print any output (via DVC).
|
|
||||||
analytics (bool): Opt-in to DVC analytics (defaults to False).
|
|
||||||
"""
|
|
||||||
with working_dir(project_dir) as cwd:
|
|
||||||
if git:
|
|
||||||
run_command(["git", "init"])
|
|
||||||
init_cmd = ["dvc", "init"]
|
|
||||||
if silent:
|
|
||||||
init_cmd.append("--quiet")
|
|
||||||
if not git:
|
|
||||||
init_cmd.append("--no-scm")
|
|
||||||
if force:
|
|
||||||
init_cmd.append("--force")
|
|
||||||
run_command(init_cmd)
|
|
||||||
# We don't want to have analytics on by default – our users should
|
|
||||||
# opt-in explicitly. If they want it, they can always enable it.
|
|
||||||
if not analytics:
|
|
||||||
run_command(["dvc", "config", "core.analytics", "false"])
|
|
||||||
# Remove unused and confusing plot templates from .dvc directory
|
|
||||||
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
|
|
||||||
# once you commit your changes via Git and it creates a bunch of files
|
|
||||||
# that have no purpose
|
|
||||||
plots_dir = cwd / DVC_DIR / "plots"
|
|
||||||
if plots_dir.exists():
|
|
||||||
shutil.rmtree(str(plots_dir))
|
|
||||||
config = load_project_config(cwd)
|
|
||||||
setup_check_dvc(cwd, config)
|
|
||||||
|
|
||||||
|
|
||||||
def project_assets(project_dir: Path) -> None:
|
|
||||||
"""Fetch assets for a project using DVC if possible.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
"""
|
|
||||||
project_path = ensure_path(project_dir)
|
|
||||||
config = load_project_config(project_path)
|
|
||||||
setup_check_dvc(project_path, config)
|
|
||||||
assets = config.get("assets", {})
|
|
||||||
if not assets:
|
|
||||||
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
|
||||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
fetched_assets = []
|
|
||||||
for asset in assets:
|
|
||||||
url = asset["url"].format(**variables)
|
|
||||||
dest = asset["dest"].format(**variables)
|
|
||||||
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
|
|
||||||
if fetched_path:
|
|
||||||
fetched_assets.append(str(fetched_path))
|
|
||||||
if fetched_assets:
|
|
||||||
with working_dir(project_path):
|
|
||||||
run_command(["dvc", "add", *fetched_assets, "--external"])
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_asset(
|
|
||||||
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
|
||||||
) -> Optional[Path]:
|
|
||||||
"""Fetch an asset from a given URL or path. Will try to import the file
|
|
||||||
using DVC's import-url if possible (fully tracked and versioned) and falls
|
|
||||||
back to get-url (versioned) and a non-DVC download if necessary. If a
|
|
||||||
checksum is provided and a local file exists, it's only re-downloaded if the
|
|
||||||
checksum doesn't match.
|
|
||||||
|
|
||||||
project_path (Path): Path to project directory.
|
|
||||||
url (str): URL or path to asset.
|
|
||||||
checksum (Optional[str]): Optional expected checksum of local file.
|
|
||||||
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
|
||||||
the asset failed.
|
|
||||||
"""
|
|
||||||
url = convert_asset_url(url)
|
|
||||||
dest_path = (project_path / dest).resolve()
|
|
||||||
if dest_path.exists() and checksum:
|
|
||||||
# If there's already a file, check for checksum
|
|
||||||
# TODO: add support for caches (dvc import-url with local path)
|
|
||||||
if checksum == get_checksum(dest_path):
|
|
||||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
|
||||||
return dest_path
|
|
||||||
with working_dir(project_path):
|
|
||||||
try:
|
|
||||||
# If these fail, we don't want to output an error or info message.
|
|
||||||
# Try with tracking the source first, then just downloading with
|
|
||||||
# DVC, then a regular non-DVC download.
|
|
||||||
try:
|
|
||||||
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
|
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
try:
|
|
||||||
download_file(url, dest_path)
|
|
||||||
except requests.exceptions.HTTPError as e:
|
|
||||||
msg.fail(f"Download failed: {dest}", e)
|
|
||||||
return None
|
|
||||||
if checksum and checksum != get_checksum(dest_path):
|
|
||||||
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
|
|
||||||
msg.good(f"Fetched asset {dest}")
|
|
||||||
return dest_path
|
|
||||||
|
|
||||||
|
|
||||||
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
|
||||||
"""Run all commands defined in the project using DVC.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
*dvc_args: Other arguments passed to "dvc repro".
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
dvc_cmd = ["dvc", "repro", *dvc_args]
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_command(dvc_cmd)
|
|
||||||
|
|
||||||
|
|
||||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
|
||||||
"""Simulate a CLI help prompt using the info available in the project config.
|
|
||||||
|
|
||||||
project_dir (Path): The project directory.
|
|
||||||
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
|
||||||
provided, the subcommand help is shown. Otherwise, the top-level help
|
|
||||||
and a list of available commands is printed.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
if subcommand:
|
|
||||||
validate_subcommand(commands.keys(), subcommand)
|
|
||||||
print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
|
|
||||||
help_text = commands[subcommand].get("help")
|
|
||||||
if help_text:
|
|
||||||
msg.text(f"\n{help_text}\n")
|
|
||||||
else:
|
|
||||||
print(f"\nAvailable commands in {CONFIG_FILE}")
|
|
||||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
|
|
||||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
|
||||||
msg.text("Run all commands defined in the 'run' block of the project config:")
|
|
||||||
print(f"{COMMAND} project run-all {project_dir}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
|
||||||
"""Run a named script defined in the project config. If the script is part
|
|
||||||
of the default pipeline (defined in the "run" section), DVC is used to
|
|
||||||
execute the command, so it can determine whether to rerun it. It then
|
|
||||||
calls into "exec" to execute it.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
subcommand (str): Name of command to run.
|
|
||||||
*dvc_args: Other arguments passed to "dvc repro".
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
validate_subcommand(commands.keys(), subcommand)
|
|
||||||
if subcommand in config.get("run", []):
|
|
||||||
# This is one of the pipeline commands tracked in DVC
|
|
||||||
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_command(dvc_cmd)
|
|
||||||
else:
|
|
||||||
cmd = commands[subcommand]
|
|
||||||
# Deps in non-DVC commands aren't tracked, but if they're defined,
|
|
||||||
# make sure they exist before running the command
|
|
||||||
for dep in cmd.get("deps", []):
|
|
||||||
if not (project_dir / dep).exists():
|
|
||||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_commands(cmd["script"], variables)
|
|
||||||
|
|
||||||
|
|
||||||
def project_exec(project_dir: Path, subcommand: str):
|
|
||||||
"""Execute a command defined in the project config.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
subcommand (str): Name of command to run.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_commands(commands[subcommand]["script"], variables)
|
|
||||||
|
|
||||||
|
|
||||||
###########
|
|
||||||
# HELPERS #
|
|
||||||
###########
|
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(path: Path) -> Dict[str, Any]:
|
|
||||||
"""Load the project config file from a directory and validate it.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
RETURNS (Dict[str, Any]): The loaded project config.
|
|
||||||
"""
|
|
||||||
config_path = path / CONFIG_FILE
|
|
||||||
if not config_path.exists():
|
|
||||||
msg.fail("Can't find project config", config_path, exits=1)
|
|
||||||
invalid_err = f"Invalid project config in {CONFIG_FILE}"
|
|
||||||
try:
|
|
||||||
config = srsly.read_yaml(config_path)
|
|
||||||
except ValueError as e:
|
|
||||||
msg.fail(invalid_err, e, exits=1)
|
|
||||||
errors = validate(ProjectConfigSchema, config)
|
|
||||||
if errors:
|
|
||||||
msg.fail(invalid_err, "\n".join(errors), exits=1)
|
|
||||||
return config
|
|
||||||
|
|
||||||
|
|
||||||
def update_dvc_config(
|
|
||||||
path: Path,
|
|
||||||
config: Dict[str, Any],
|
|
||||||
verbose: bool = False,
|
|
||||||
silent: bool = False,
|
|
||||||
force: bool = False,
|
|
||||||
) -> bool:
|
|
||||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
|
||||||
project directory. The file is auto-generated based on the config. The
|
|
||||||
first line of the auto-generated file specifies the hash of the config
|
|
||||||
dict, so if any of the config values change, the DVC config is regenerated.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
config (Dict[str, Any]): The loaded project config.
|
|
||||||
verbose (bool): Whether to print additional info (via DVC).
|
|
||||||
silent (bool): Don't output anything (via DVC).
|
|
||||||
force (bool): Force update, even if hashes match.
|
|
||||||
RETURNS (bool): Whether the DVC config file was updated.
|
|
||||||
"""
|
|
||||||
config_hash = get_hash(config)
|
|
||||||
path = path.resolve()
|
|
||||||
dvc_config_path = path / DVC_CONFIG
|
|
||||||
if dvc_config_path.exists():
|
|
||||||
# Check if the file was generated using the current config, if not, redo
|
|
||||||
with dvc_config_path.open("r", encoding="utf8") as f:
|
|
||||||
ref_hash = f.readline().strip().replace("# ", "")
|
|
||||||
if ref_hash == config_hash and not force:
|
|
||||||
return False # Nothing has changed in project config, don't need to update
|
|
||||||
dvc_config_path.unlink()
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = []
|
|
||||||
# We only want to include commands that are part of the main list of "run"
|
|
||||||
# commands in project.yml and should be run in sequence
|
|
||||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
|
||||||
for name in config.get("run", []):
|
|
||||||
validate_subcommand(config_commands.keys(), name)
|
|
||||||
command = config_commands[name]
|
|
||||||
deps = command.get("deps", [])
|
|
||||||
outputs = command.get("outputs", [])
|
|
||||||
outputs_no_cache = command.get("outputs_no_cache", [])
|
|
||||||
if not deps and not outputs and not outputs_no_cache:
|
|
||||||
continue
|
|
||||||
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
|
||||||
# and we don't want arbitrary paths in there
|
|
||||||
project_cmd = ["python", "-m", NAME, "project", "exec", name]
|
|
||||||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
|
||||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
|
||||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
|
||||||
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
|
|
||||||
if verbose:
|
|
||||||
dvc_cmd.append("--verbose")
|
|
||||||
if silent:
|
|
||||||
dvc_cmd.append("--quiet")
|
|
||||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
|
||||||
commands.append(" ".join(full_cmd))
|
|
||||||
with working_dir(path):
|
|
||||||
run_commands(commands, variables, silent=True)
|
|
||||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
|
||||||
content = f.read()
|
|
||||||
f.seek(0, 0)
|
|
||||||
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_dvc() -> None:
|
|
||||||
"""Ensure that the "dvc" command is available and show an error if not."""
|
|
||||||
try:
|
|
||||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
|
||||||
except Exception:
|
|
||||||
msg.fail(
|
|
||||||
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
|
|
||||||
"You can install the Python package from pip (pip install dvc) or "
|
|
||||||
"conda (conda install -c conda-forge dvc). For more details, see the "
|
|
||||||
"documentation: https://dvc.org/doc/install",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
|
|
||||||
"""Check that the project is set up correctly with DVC and update its
|
|
||||||
config if needed. Will raise an error if the project is not an initialized
|
|
||||||
DVC project.
|
|
||||||
|
|
||||||
project_dir (Path): The path to the project directory.
|
|
||||||
config (Dict[str, Any]): The loaded project config.
|
|
||||||
"""
|
|
||||||
if not project_dir.exists():
|
|
||||||
msg.fail(f"Can't find project directory: {project_dir}")
|
|
||||||
if not (project_dir / ".dvc").exists():
|
|
||||||
msg.fail(
|
|
||||||
"Project not initialized as a DVC project.",
|
|
||||||
f"Make sure that the project template was cloned correctly. To "
|
|
||||||
f"initialize the project directory manually, you can run: "
|
|
||||||
f"{COMMAND} project init {project_dir}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
with msg.loading("Updating DVC config..."):
|
|
||||||
updated = update_dvc_config(project_dir, config, silent=True)
|
|
||||||
if updated:
|
|
||||||
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|
|
||||||
|
|
||||||
|
|
||||||
def run_commands(
|
|
||||||
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
|
|
||||||
) -> None:
|
|
||||||
"""Run a sequence of commands in a subprocess, in order.
|
|
||||||
|
|
||||||
commands (List[str]): The string commands.
|
|
||||||
variables (Dict[str, str]): Dictionary of variable names, mapped to their
|
|
||||||
values. Will be used to substitute format string variables in the
|
|
||||||
commands.
|
|
||||||
silent (bool): Don't print the commands.
|
|
||||||
"""
|
|
||||||
for command in commands:
|
|
||||||
# Substitute variables, e.g. "./{NAME}.json"
|
|
||||||
command = command.format(**variables)
|
|
||||||
command = split_command(command)
|
|
||||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
|
||||||
# use commands in their config that reference "python" and we want to
|
|
||||||
# make sure that it's always executing the same Python that spaCy is
|
|
||||||
# executed with and the pip in the same env, not some other Python/pip.
|
|
||||||
# Also ensures cross-compatibility if user 1 writes "python3" (because
|
|
||||||
# that's how it's set up on their system), and user 2 without the
|
|
||||||
# shortcut tries to re-run the command.
|
|
||||||
if len(command) and command[0] in ("python", "python3"):
|
|
||||||
command[0] = sys.executable
|
|
||||||
elif len(command) and command[0] in ("pip", "pip3"):
|
|
||||||
command = [sys.executable, "-m", "pip", *command[1:]]
|
|
||||||
if not silent:
|
|
||||||
print(f"Running command: {' '.join(command)}")
|
|
||||||
run_command(command)
|
|
||||||
|
|
||||||
|
|
||||||
def convert_asset_url(url: str) -> str:
|
|
||||||
"""Check and convert the asset URL if needed.
|
|
||||||
|
|
||||||
url (str): The asset URL.
|
|
||||||
RETURNS (str): The converted URL.
|
|
||||||
"""
|
|
||||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
|
||||||
if re.match("(http(s?)):\/\/github.com", url):
|
|
||||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
|
||||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
|
||||||
msg.warn(
|
|
||||||
"Downloading from a regular GitHub URL. This will only download "
|
|
||||||
"the source of the page, not the actual file. Converting the URL "
|
|
||||||
"to a raw URL.",
|
|
||||||
converted,
|
|
||||||
)
|
|
||||||
return converted
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
def check_clone(name: str, dest: Path, repo: str) -> None:
|
|
||||||
"""Check and validate that the destination path can be used to clone. Will
|
|
||||||
check that Git is available and that the destination path is suitable.
|
|
||||||
|
|
||||||
name (str): Name of the directory to clone from the repo.
|
|
||||||
dest (Path): Local destination of cloned directory.
|
|
||||||
repo (str): URL of the repo to clone from.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
|
|
||||||
except Exception:
|
|
||||||
msg.fail(
|
|
||||||
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
|
||||||
f"To clone a project without Git, copy the files from the '{name}' "
|
|
||||||
f"directory in the {repo} to {dest} manually and then run:",
|
|
||||||
f"{COMMAND} project init {dest}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if not dest:
|
|
||||||
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
|
||||||
if dest.exists():
|
|
||||||
# Directory already exists (not allowed, clone needs to create it)
|
|
||||||
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
|
|
||||||
if not dest.parent.exists():
|
|
||||||
# We're not creating parents, parent dir should exist
|
|
||||||
msg.fail(
|
|
||||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
|
|
||||||
"""Check that a subcommand is valid and defined. Raises an error otherwise.
|
|
||||||
|
|
||||||
commands (Sequence[str]): The available commands.
|
|
||||||
subcommand (str): The subcommand.
|
|
||||||
"""
|
|
||||||
if subcommand not in commands:
|
|
||||||
msg.fail(
|
|
||||||
f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
|
|
||||||
f"Available commands: {', '.join(commands)}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
|
||||||
"""Download a file using requests.
|
|
||||||
|
|
||||||
url (str): The URL of the file.
|
|
||||||
dest (Path): The destination path.
|
|
||||||
chunk_size (int): The size of chunks to read/write.
|
|
||||||
"""
|
|
||||||
response = requests.get(url, stream=True)
|
|
||||||
response.raise_for_status()
|
|
||||||
total = int(response.headers.get("content-length", 0))
|
|
||||||
progress_settings = {
|
|
||||||
"total": total,
|
|
||||||
"unit": "iB",
|
|
||||||
"unit_scale": True,
|
|
||||||
"unit_divisor": chunk_size,
|
|
||||||
"leave": False,
|
|
||||||
}
|
|
||||||
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
|
|
||||||
for data in response.iter_content(chunk_size=chunk_size):
|
|
||||||
size = f.write(data)
|
|
||||||
bar.update(size)
|
|
0
spacy/cli/project/__init__.py
Normal file
0
spacy/cli/project/__init__.py
Normal file
154
spacy/cli/project/assets.py
Normal file
154
spacy/cli/project/assets.py
Normal file
|
@ -0,0 +1,154 @@
|
||||||
|
from typing import Optional
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import requests
|
||||||
|
import tqdm
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from ...util import ensure_path, get_checksum, working_dir
|
||||||
|
from .._app import project_cli, Arg
|
||||||
|
from .util import PROJECT_FILE, load_project_config
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: find a solution for caches
|
||||||
|
# CACHES = [
|
||||||
|
# Path.home() / ".torch",
|
||||||
|
# Path.home() / ".caches" / "torch",
|
||||||
|
# os.environ.get("TORCH_HOME"),
|
||||||
|
# Path.home() / ".keras",
|
||||||
|
# ]
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("assets")
|
||||||
|
def project_assets_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||||
|
defined in the "assets" section of the project.yml. If a checksum is
|
||||||
|
provided in the project.yml, the file is only downloaded if no local file
|
||||||
|
with the same checksum exists.
|
||||||
|
"""
|
||||||
|
project_assets(project_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def project_assets(project_dir: Path) -> None:
|
||||||
|
"""Fetch assets for a project using DVC if possible.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
"""
|
||||||
|
project_path = ensure_path(project_dir)
|
||||||
|
config = load_project_config(project_path)
|
||||||
|
assets = config.get("assets", {})
|
||||||
|
if not assets:
|
||||||
|
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||||
|
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
for asset in assets:
|
||||||
|
dest = asset["dest"].format(**variables)
|
||||||
|
url = asset.get("url")
|
||||||
|
checksum = asset.get("checksum")
|
||||||
|
if not url:
|
||||||
|
# project.yml defines asset without URL that the user has to place
|
||||||
|
check_private_asset(dest, checksum)
|
||||||
|
continue
|
||||||
|
url = url.format(**variables)
|
||||||
|
fetch_asset(project_path, url, dest, checksum)
|
||||||
|
|
||||||
|
|
||||||
|
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
|
||||||
|
"""Check and validate assets without a URL (private assets that the user
|
||||||
|
has to provide themselves) and give feedback about the checksum.
|
||||||
|
|
||||||
|
dest (Path): Desintation path of the asset.
|
||||||
|
checksum (Optional[str]): Optional checksum of the expected file.
|
||||||
|
"""
|
||||||
|
if not Path(dest).exists():
|
||||||
|
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
|
||||||
|
msg.warn(err)
|
||||||
|
else:
|
||||||
|
if checksum and checksum == get_checksum(dest):
|
||||||
|
msg.good(f"Asset exists with matching checksum: {dest}")
|
||||||
|
else:
|
||||||
|
msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_asset(
|
||||||
|
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
"""Fetch an asset from a given URL or path. If a checksum is provided and a
|
||||||
|
local file exists, it's only re-downloaded if the checksum doesn't match.
|
||||||
|
|
||||||
|
project_path (Path): Path to project directory.
|
||||||
|
url (str): URL or path to asset.
|
||||||
|
checksum (Optional[str]): Optional expected checksum of local file.
|
||||||
|
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
||||||
|
the asset failed.
|
||||||
|
"""
|
||||||
|
# TODO: add support for caches
|
||||||
|
dest_path = (project_path / dest).resolve()
|
||||||
|
if dest_path.exists() and checksum:
|
||||||
|
# If there's already a file, check for checksum
|
||||||
|
if checksum == get_checksum(dest_path):
|
||||||
|
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||||
|
return dest_path
|
||||||
|
with working_dir(project_path):
|
||||||
|
url = convert_asset_url(url)
|
||||||
|
try:
|
||||||
|
download_file(url, dest_path)
|
||||||
|
msg.good(f"Downloaded asset {dest}")
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
if Path(url).exists() and Path(url).is_file():
|
||||||
|
# If it's a local file, copy to destination
|
||||||
|
shutil.copy(url, str(dest_path))
|
||||||
|
msg.good(f"Copied local asset {dest}")
|
||||||
|
else:
|
||||||
|
msg.fail(f"Download failed: {dest}", e)
|
||||||
|
return
|
||||||
|
if checksum and checksum != get_checksum(dest_path):
|
||||||
|
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def convert_asset_url(url: str) -> str:
|
||||||
|
"""Check and convert the asset URL if needed.
|
||||||
|
|
||||||
|
url (str): The asset URL.
|
||||||
|
RETURNS (str): The converted URL.
|
||||||
|
"""
|
||||||
|
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||||
|
if re.match(r"(http(s?)):\/\/github.com", url):
|
||||||
|
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||||
|
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||||
|
msg.warn(
|
||||||
|
"Downloading from a regular GitHub URL. This will only download "
|
||||||
|
"the source of the page, not the actual file. Converting the URL "
|
||||||
|
"to a raw URL.",
|
||||||
|
converted,
|
||||||
|
)
|
||||||
|
return converted
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
||||||
|
"""Download a file using requests.
|
||||||
|
|
||||||
|
url (str): The URL of the file.
|
||||||
|
dest (Path): The destination path.
|
||||||
|
chunk_size (int): The size of chunks to read/write.
|
||||||
|
"""
|
||||||
|
response = requests.get(url, stream=True)
|
||||||
|
response.raise_for_status()
|
||||||
|
total = int(response.headers.get("content-length", 0))
|
||||||
|
progress_settings = {
|
||||||
|
"total": total,
|
||||||
|
"unit": "iB",
|
||||||
|
"unit_scale": True,
|
||||||
|
"unit_divisor": chunk_size,
|
||||||
|
"leave": False,
|
||||||
|
}
|
||||||
|
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
|
||||||
|
for data in response.iter_content(chunk_size=chunk_size):
|
||||||
|
size = f.write(data)
|
||||||
|
bar.update(size)
|
110
spacy/cli/project/clone.py
Normal file
110
spacy/cli/project/clone.py
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import subprocess
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from ... import about
|
||||||
|
from ...util import ensure_path, run_command, make_tempdir
|
||||||
|
from .._app import project_cli, Arg, Opt, COMMAND
|
||||||
|
|
||||||
|
|
||||||
|
DIRS = [
|
||||||
|
"assets",
|
||||||
|
"metas",
|
||||||
|
"configs",
|
||||||
|
"packages",
|
||||||
|
"metrics",
|
||||||
|
"scripts",
|
||||||
|
"notebooks",
|
||||||
|
"training",
|
||||||
|
"corpus",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("clone")
|
||||||
|
def project_clone_cli(
|
||||||
|
# fmt: off
|
||||||
|
name: str = Arg(..., help="The name of the template to fetch"),
|
||||||
|
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
||||||
|
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Clone a project template from a repository. Calls into "git" and will
|
||||||
|
only download the files from the given subdirectory. The GitHub repo
|
||||||
|
defaults to the official spaCy template repo, but can be customized
|
||||||
|
(including using a private repo). Setting the --git flag will also
|
||||||
|
initialize the project directory as a Git repo. If the project is intended
|
||||||
|
to be a Git repo, it should be initialized with Git first, before
|
||||||
|
initializing DVC (Data Version Control). This allows DVC to integrate with
|
||||||
|
Git.
|
||||||
|
"""
|
||||||
|
if dest == Path.cwd():
|
||||||
|
dest = dest / name
|
||||||
|
project_clone(name, dest, repo=repo)
|
||||||
|
|
||||||
|
|
||||||
|
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
|
||||||
|
"""Clone a project template from a repository.
|
||||||
|
|
||||||
|
name (str): Name of subdirectory to clone.
|
||||||
|
dest (Path): Destination path of cloned project.
|
||||||
|
repo (str): URL of Git repo containing project templates.
|
||||||
|
"""
|
||||||
|
dest = ensure_path(dest)
|
||||||
|
check_clone(name, dest, repo)
|
||||||
|
project_dir = dest.resolve()
|
||||||
|
# We're using Git and sparse checkout to only clone the files we need
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
||||||
|
try:
|
||||||
|
run_command(cmd)
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
|
||||||
|
msg.fail(err)
|
||||||
|
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
||||||
|
f.write(name)
|
||||||
|
try:
|
||||||
|
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
||||||
|
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
err = f"Could not clone '{name}' in the repo '{repo}'."
|
||||||
|
msg.fail(err)
|
||||||
|
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
||||||
|
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
|
||||||
|
for sub_dir in DIRS:
|
||||||
|
dir_path = project_dir / sub_dir
|
||||||
|
if not dir_path.exists():
|
||||||
|
dir_path.mkdir(parents=True)
|
||||||
|
msg.good(f"Your project is now ready!", dest)
|
||||||
|
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||||
|
"""Check and validate that the destination path can be used to clone. Will
|
||||||
|
check that Git is available and that the destination path is suitable.
|
||||||
|
|
||||||
|
name (str): Name of the directory to clone from the repo.
|
||||||
|
dest (Path): Local destination of cloned directory.
|
||||||
|
repo (str): URL of the repo to clone from.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
|
||||||
|
except Exception:
|
||||||
|
msg.fail(
|
||||||
|
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
||||||
|
f"To clone a project without Git, copy the files from the '{name}' "
|
||||||
|
f"directory in the {repo} to {dest} manually and then run:",
|
||||||
|
f"{COMMAND} project init {dest}",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
if not dest:
|
||||||
|
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
||||||
|
if dest.exists():
|
||||||
|
# Directory already exists (not allowed, clone needs to create it)
|
||||||
|
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
|
||||||
|
if not dest.parent.exists():
|
||||||
|
# We're not creating parents, parent dir should exist
|
||||||
|
msg.fail(
|
||||||
|
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
||||||
|
exits=1,
|
||||||
|
)
|
206
spacy/cli/project/dvc.py
Normal file
206
spacy/cli/project/dvc.py
Normal file
|
@ -0,0 +1,206 @@
|
||||||
|
"""This module contains helpers and subcommands for integrating spaCy projects
|
||||||
|
with Data Version Controk (DVC). https://dvc.org"""
|
||||||
|
from typing import Dict, Any, List, Optional
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
|
from .util import PROJECT_FILE, load_project_config
|
||||||
|
from .._app import project_cli, Arg, Opt, NAME, COMMAND
|
||||||
|
from ...util import get_hash, working_dir, split_command, join_command, run_command
|
||||||
|
|
||||||
|
|
||||||
|
DVC_CONFIG = "dvc.yaml"
|
||||||
|
DVC_DIR = ".dvc"
|
||||||
|
UPDATE_COMMAND = "dvc"
|
||||||
|
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
|
||||||
|
# edited your {PROJECT_FILE}, you can regenerate this file by running:
|
||||||
|
# {COMMAND} project {UPDATE_COMMAND}"""
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(UPDATE_COMMAND)
|
||||||
|
def project_update_dvc_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
|
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
|
||||||
|
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||||
|
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Auto-generate Data Version Control (DVC) config. A DVC
|
||||||
|
project can only define one pipeline, so you need to specify one workflow
|
||||||
|
defined in the project.yml. If no workflow is specified, the first defined
|
||||||
|
workflow is used. The DVC config will only be updated if
|
||||||
|
"""
|
||||||
|
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
||||||
|
|
||||||
|
|
||||||
|
def project_update_dvc(
|
||||||
|
project_dir: Path,
|
||||||
|
workflow: Optional[str] = None,
|
||||||
|
*,
|
||||||
|
verbose: bool = False,
|
||||||
|
force: bool = False,
|
||||||
|
) -> None:
|
||||||
|
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
|
||||||
|
project can only define one pipeline, so you need to specify one workflow
|
||||||
|
defined in the project.yml. Will only update the file if the checksum changed.
|
||||||
|
|
||||||
|
project_dir (Path): The project directory.
|
||||||
|
workflow (Optional[str]): Optional name of workflow defined in project.yml.
|
||||||
|
If not set, the first workflow will be used.
|
||||||
|
verbose (bool): Print more info.
|
||||||
|
force (bool): Force update DVC config.
|
||||||
|
"""
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
updated = update_dvc_config(
|
||||||
|
project_dir, config, workflow, verbose=verbose, force=force
|
||||||
|
)
|
||||||
|
help_msg = "To execute the workflow with DVC, run: dvc repro"
|
||||||
|
if updated:
|
||||||
|
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
|
||||||
|
else:
|
||||||
|
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
|
||||||
|
|
||||||
|
|
||||||
|
def update_dvc_config(
|
||||||
|
path: Path,
|
||||||
|
config: Dict[str, Any],
|
||||||
|
workflow: Optional[str] = None,
|
||||||
|
verbose: bool = False,
|
||||||
|
silent: bool = False,
|
||||||
|
force: bool = False,
|
||||||
|
) -> bool:
|
||||||
|
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||||
|
project directory. The file is auto-generated based on the config. The
|
||||||
|
first line of the auto-generated file specifies the hash of the config
|
||||||
|
dict, so if any of the config values change, the DVC config is regenerated.
|
||||||
|
|
||||||
|
path (Path): The path to the project directory.
|
||||||
|
config (Dict[str, Any]): The loaded project.yml.
|
||||||
|
verbose (bool): Whether to print additional info (via DVC).
|
||||||
|
silent (bool): Don't output anything (via DVC).
|
||||||
|
force (bool): Force update, even if hashes match.
|
||||||
|
RETURNS (bool): Whether the DVC config file was updated.
|
||||||
|
"""
|
||||||
|
ensure_dvc(path)
|
||||||
|
workflows = config.get("workflows", {})
|
||||||
|
workflow_names = list(workflows.keys())
|
||||||
|
check_workflows(workflow_names, workflow)
|
||||||
|
if not workflow:
|
||||||
|
workflow = workflow_names[0]
|
||||||
|
config_hash = get_hash(config)
|
||||||
|
path = path.resolve()
|
||||||
|
dvc_config_path = path / DVC_CONFIG
|
||||||
|
if dvc_config_path.exists():
|
||||||
|
# Check if the file was generated using the current config, if not, redo
|
||||||
|
with dvc_config_path.open("r", encoding="utf8") as f:
|
||||||
|
ref_hash = f.readline().strip().replace("# ", "")
|
||||||
|
if ref_hash == config_hash and not force:
|
||||||
|
return False # Nothing has changed in project.yml, don't need to update
|
||||||
|
dvc_config_path.unlink()
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
dvc_commands = []
|
||||||
|
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
|
for name in workflows[workflow]:
|
||||||
|
command = config_commands[name]
|
||||||
|
deps = command.get("deps", [])
|
||||||
|
outputs = command.get("outputs", [])
|
||||||
|
outputs_no_cache = command.get("outputs_no_cache", [])
|
||||||
|
if not deps and not outputs and not outputs_no_cache:
|
||||||
|
continue
|
||||||
|
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
||||||
|
# and we don't want arbitrary paths in there
|
||||||
|
project_cmd = ["python", "-m", NAME, "project", "run", name]
|
||||||
|
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||||
|
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||||
|
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||||
|
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
|
||||||
|
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||||
|
dvc_commands.append(join_command(full_cmd))
|
||||||
|
with working_dir(path):
|
||||||
|
dvc_flags = {"--verbose": verbose, "--quiet": silent}
|
||||||
|
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
|
||||||
|
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||||
|
content = f.read()
|
||||||
|
f.seek(0, 0)
|
||||||
|
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def run_dvc_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, str] = None,
    flags: Dict[str, bool] = None,
) -> None:
    """Run a sequence of DVC commands in a subprocess, in order.

    commands (List[str]): The string commands without the leading "dvc".
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands. Defaults to an empty dict when None.
    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
        easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
        Defaults to an empty dict when None.
    """
    # Use None sentinels instead of mutable {} defaults: a literal dict default
    # would be shared across all calls to this function.
    variables = {} if variables is None else variables
    flags = {} if flags is None else flags
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        dvc_command = ["dvc", *command]
        # Add the flags if they are set to True
        for flag, is_active in flags.items():
            if is_active:
                dvc_command.append(flag)
        run_command(dvc_command)
|
||||||
|
|
||||||
|
|
||||||
|
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    # No workflows at all: there is nothing a DVC config could be built from.
    if not workflows:
        err = (
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands."
        )
        msg.fail(err, exits=1)
    # A workflow was named but doesn't exist in the config.
    if workflow is not None and workflow not in workflows:
        err = (
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}"
        )
        msg.fail(err, exits=1)
    # No workflow named: warn that the first one will be used implicitly.
    if not workflow:
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project.

    project_dir (Path): The project directory to check for a .dvc folder.
    """
    install_help = (
        "You can install the Python package from pip (pip install dvc) or "
        "conda (conda install -c conda-forge dvc). For more details, see the "
        "documentation: https://dvc.org/doc/install"
    )
    init_help = (
        "To initialize a DVC project, you can run 'dvc init' in the project "
        "directory. For more details, see the documentation: "
        "https://dvc.org/doc/command-reference/init"
    )
    try:
        # We only care whether the executable can be invoked; its version
        # output is discarded.
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            install_help,
            exits=1,
        )
    # An initialized DVC project always has a .dvc directory at its root.
    if not (project_dir / ".dvc").exists():
        msg.fail("Project not initialized as a DVC project", init_help, exits=1)
|
250
spacy/cli/project/run.py
Normal file
250
spacy/cli/project/run.py
Normal file
|
@ -0,0 +1,250 @@
|
||||||
|
from typing import Optional, List, Dict, Sequence, Any
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import typer
|
||||||
|
import sys
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from ...util import working_dir, run_command, split_command, is_cwd, get_checksum
|
||||||
|
from ...util import get_hash, join_command
|
||||||
|
from .._app import project_cli, Arg, Opt, COMMAND
|
||||||
|
from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named script or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define inputs and/or outputs, they will only be re-run if state
    has changed.
    """
    # With no subcommand (or explicit --help), print the project's own help
    # screen instead of running anything.
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        # Extra CLI args collected by typer (allow_extra_args) are forwarded
        # positionally. NOTE(review): project_run's signature is keyword-only
        # after `subcommand` — confirm it accepts these extra positionals.
        project_run(project_dir, subcommand, *ctx.args, force=force, dry=dry)
|
||||||
|
|
||||||
|
|
||||||
|
def project_run(
    project_dir: Path, subcommand: str, *cli_args, force: bool = False, dry: bool = False
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    cli_args: Extra positional arguments forwarded from the CLI (typer is
        configured with allow_extra_args, and project_run_cli passes them as
        *ctx.args). Currently unused, but accepting them here prevents a
        TypeError when any extra arguments are supplied on the command line.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    """
    config = load_project_config(project_dir)
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    if subcommand in workflows:
        # A workflow is just an ordered list of commands: run each in turn.
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(project_dir, cmd, force=force, dry=dry)
    else:
        cmd = commands[subcommand]
        # Check that all declared dependencies exist before running anything.
        # In a dry run this only warns instead of exiting.
        for dep in cmd.get("deps", []):
            dep = dep.format(**variables)
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, **err_kwargs)
        with working_dir(project_dir) as current_dir:
            rerun = check_rerun(current_dir, cmd, variables)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                msg.divider(subcommand)
                run_commands(cmd["script"], variables, dry=dry)
                update_lockfile(current_dir, cmd, variables)
|
||||||
|
|
||||||
|
|
||||||
|
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    project_loc = "" if is_cwd(project_dir) else project_dir
    if subcommand:
        # Bug fix: validate_subcommand expects (commands, workflows, subcommand).
        # The workflows argument was previously missing, which raised a
        # TypeError whenever a subcommand was given.
        validate_subcommand(commands.keys(), workflows.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
        # The subcommand may be a workflow name, which has no entry in
        # `commands` — use .get() to avoid a KeyError in that case.
        help_text = commands.get(subcommand, {}).get("help")
        if help_text:
            msg.text(f"\n{help_text}\n")
    else:
        print(f"\nAvailable commands in {PROJECT_FILE}")
        print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:")
        print(f"{COMMAND} project run {project_loc}")
|
||||||
|
|
||||||
|
|
||||||
|
def run_commands(
    commands: List[str] = tuple(),
    variables: Optional[Dict[str, Any]] = None,
    silent: bool = False,
    dry: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    variables (Optional[Dict[str, Any]]): Dictionary of variable names, mapped
        to their values. Will be used to substitute format string variables in
        the commands. Defaults to an empty dict when None.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    """
    # Use a None sentinel instead of a mutable {} default: a literal dict
    # default would be shared across all calls to this function.
    variables = {} if variables is None else variables
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if len(command) and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif len(command) and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if not dry:
            run_command(command)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    # Known command or workflow: nothing to report.
    if subcommand in commands or subcommand in workflows:
        return
    # Build a hint listing whatever kinds of targets are available.
    hints = []
    if commands:
        hints.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        hints.append(f"Available workflows: {', '.join(workflows)}")
    msg.fail(
        f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
        ". ".join(hints),
        exits=1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def check_rerun(
|
||||||
|
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
|
||||||
|
) -> bool:
|
||||||
|
"""Check if a command should be rerun because its settings or inputs/outputs
|
||||||
|
changed.
|
||||||
|
|
||||||
|
project_dir (Path): The current project directory.
|
||||||
|
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||||
|
variables (Dict[str, Any]): The variables defined in the project.yml.
|
||||||
|
RETURNS (bool): Whether to re-run the command.
|
||||||
|
"""
|
||||||
|
lock_path = project_dir / PROJECT_LOCK
|
||||||
|
if not lock_path.exists(): # We don't have a lockfile, run command
|
||||||
|
return True
|
||||||
|
data = srsly.read_yaml(lock_path)
|
||||||
|
if command["name"] not in data: # We don't have info about this command
|
||||||
|
return True
|
||||||
|
entry = data[command["name"]]
|
||||||
|
# If the entry in the lockfile matches the lockfile entry that would be
|
||||||
|
# generated from the current command, we don't rerun because it means that
|
||||||
|
# all inputs/outputs, hashes and scripts are the same and nothing changed
|
||||||
|
return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
|
||||||
|
|
||||||
|
|
||||||
|
def update_lockfile(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if lock_path.exists():
        data = srsly.read_yaml(lock_path)
    else:
        # No lockfile yet: create an empty one and start from a fresh dict.
        srsly.write_yaml(lock_path, {})
        data = {}
    data[command["name"]] = get_lock_entry(project_dir, command, variables)
    srsly.write_yaml(lock_path, data)
|
||||||
|
|
||||||
|
|
||||||
|
def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    # Collect path + checksum records for dependencies and both output kinds.
    file_info = {
        key: get_fileinfo(project_dir, command.get(key, []), variables)
        for key in ("deps", "outputs", "outputs_no_cache")
    }
    return {
        "cmd": f"{COMMAND} run {command['name']}",
        "script": command["script"],
        "deps": file_info["deps"],
        # Cached and uncached outputs are recorded together, like dvc.lock.
        "outs": [*file_info["outputs"], *file_info["outputs_no_cache"]],
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_fileinfo(
    project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
    """
    entries = []
    for raw_path in paths:
        # Substitute variables, e.g. "./{NAME}.json"
        rel_path = raw_path.format(**variables)
        abs_path = project_dir / rel_path
        # Missing files get a null checksum rather than raising.
        checksum = get_checksum(abs_path) if abs_path.exists() else None
        entries.append({"path": rel_path, "md5": checksum})
    return entries
|
57
spacy/cli/project/util.py
Normal file
57
spacy/cli/project/util.py
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
from typing import Dict, Any
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from ...schemas import ProjectConfigSchema, validate
|
||||||
|
|
||||||
|
|
||||||
|
PROJECT_FILE = "project.yml"
|
||||||
|
PROJECT_LOCK = "project.lock"
|
||||||
|
|
||||||
|
|
||||||
|
def load_project_config(path: Path) -> Dict[str, Any]:
    """Load the project.yml file from a directory and validate it.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        # msg.fail with exits=1 terminates the process here.
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        # Malformed YAML: report the parse error and exit instead of
        # surfacing a raw traceback.
        msg.fail(invalid_err, e, exits=1)
    # Schema validation returns a list of human-readable error strings.
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err, "\n".join(errors), exits=1)
    # Cross-checks commands/workflows (duplicates, clashes, unknown steps).
    validate_project_commands(config)
    return config
|
||||||
|
|
||||||
|
|
||||||
|
def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    # Any name that occurs more than once is a duplicate definition.
    duplicates = {name for name in command_names if command_names.count(name) > 1}
    if duplicates:
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        # Workflow names must not shadow command names.
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        # Every step of a workflow must be a defined command.
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
|
|
@ -203,7 +203,8 @@ def train(
|
||||||
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
||||||
train_examples = list(
|
train_examples = list(
|
||||||
corpus.train_dataset(
|
corpus.train_dataset(
|
||||||
nlp, shuffle=False, gold_preproc=training["gold_preproc"]
|
nlp, shuffle=False, gold_preproc=training["gold_preproc"],
|
||||||
|
max_length=training["max_length"]
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
nlp.begin_training(lambda: train_examples)
|
nlp.begin_training(lambda: train_examples)
|
||||||
|
@ -306,11 +307,18 @@ def create_train_batches(nlp, corpus, cfg):
|
||||||
if len(train_examples) == 0:
|
if len(train_examples) == 0:
|
||||||
raise ValueError(Errors.E988)
|
raise ValueError(Errors.E988)
|
||||||
epoch += 1
|
epoch += 1
|
||||||
batches = util.minibatch_by_words(
|
if cfg.get("batch_by_words", True):
|
||||||
train_examples,
|
batches = util.minibatch_by_words(
|
||||||
size=cfg["batch_size"],
|
train_examples,
|
||||||
discard_oversize=cfg["discard_oversize"],
|
size=cfg["batch_size"],
|
||||||
)
|
discard_oversize=cfg["discard_oversize"],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
batches = util.minibatch(
|
||||||
|
train_examples,
|
||||||
|
size=cfg["batch_size"],
|
||||||
|
)
|
||||||
|
|
||||||
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
||||||
try:
|
try:
|
||||||
first = next(batches)
|
first = next(batches)
|
||||||
|
|
|
@ -477,15 +477,14 @@ class Errors(object):
|
||||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
|
||||||
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
|
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
|
||||||
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
||||||
"array and {doc_length} for the Doc itself.")
|
"array and {doc_length} for the Doc itself.")
|
||||||
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
|
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
|
||||||
E973 = ("Unexpected type for NER data")
|
E973 = ("Unexpected type for NER data")
|
||||||
E974 = ("Unknown {obj} attribute: {key}")
|
E974 = ("Unknown {obj} attribute: {key}")
|
||||||
E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
|
E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
|
||||||
"but got {type}")
|
|
||||||
E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
|
|
||||||
"but received None.")
|
"but received None.")
|
||||||
E977 = ("Can not compare a MorphAnalysis with a string object. "
|
E977 = ("Can not compare a MorphAnalysis with a string object. "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from .corpus import Corpus
|
from .corpus import Corpus
|
||||||
from .example import Example
|
from .example import Example
|
||||||
from .align import align
|
from .align import Alignment
|
||||||
|
|
||||||
from .iob_utils import iob_to_biluo, biluo_to_iob
|
from .iob_utils import iob_to_biluo, biluo_to_iob
|
||||||
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
|
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||||
|
|
|
@ -1,8 +0,0 @@
|
||||||
cdef class Alignment:
|
|
||||||
cdef public object cost
|
|
||||||
cdef public object i2j
|
|
||||||
cdef public object j2i
|
|
||||||
cdef public object i2j_multi
|
|
||||||
cdef public object j2i_multi
|
|
||||||
cdef public object cand_to_gold
|
|
||||||
cdef public object gold_to_cand
|
|
30
spacy/gold/align.py
Normal file
30
spacy/gold/align.py
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
from typing import List
|
||||||
|
import numpy
|
||||||
|
from thinc.types import Ragged
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import tokenizations
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Alignment:
    """Token alignment between two tokenizations, stored as ragged arrays of
    aligned token indices in both directions (x-to-y and y-to-x)."""

    x2y: Ragged
    y2x: Ragged

    @classmethod
    def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
        """Construct an Alignment from nested lists of aligned indices."""
        return cls(x2y=_make_ragged(x2y), y2x=_make_ragged(y2x))

    @classmethod
    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
        """Compute the alignment between two lists of token strings."""
        x2y, y2x = tokenizations.get_alignments(A, B)
        return cls.from_indices(x2y=x2y, y2x=y2x)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_ragged(indices):
    """Flatten nested index lists into one int array plus per-row lengths,
    wrapped in a thinc Ragged."""
    lengths = numpy.array([len(row) for row in indices], dtype="i")
    # Flatten the nested lists into a single sequence of indices.
    flat = [i for row in indices for i in row]
    return Ragged(numpy.array(flat, dtype="i"), lengths)
|
|
@ -1,101 +0,0 @@
|
||||||
import numpy
|
|
||||||
from ..errors import Errors, AlignmentError
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Alignment:
|
|
||||||
def __init__(self, spacy_words, gold_words):
|
|
||||||
# Do many-to-one alignment for misaligned tokens.
|
|
||||||
# If we over-segment, we'll have one gold word that covers a sequence
|
|
||||||
# of predicted words
|
|
||||||
# If we under-segment, we'll have one predicted word that covers a
|
|
||||||
# sequence of gold words.
|
|
||||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
|
||||||
# a sequence of gold words. That's many-to-many -- we don't do that
|
|
||||||
# except for NER spans where the start and end can be aligned.
|
|
||||||
cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words)
|
|
||||||
self.cost = cost
|
|
||||||
self.i2j = i2j
|
|
||||||
self.j2i = j2i
|
|
||||||
self.i2j_multi = i2j_multi
|
|
||||||
self.j2i_multi = j2i_multi
|
|
||||||
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
|
||||||
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
|
|
||||||
|
|
||||||
|
|
||||||
def align(tokens_a, tokens_b):
|
|
||||||
"""Calculate alignment tables between two tokenizations.
|
|
||||||
|
|
||||||
tokens_a (List[str]): The candidate tokenization.
|
|
||||||
tokens_b (List[str]): The reference tokenization.
|
|
||||||
RETURNS: (tuple): A 5-tuple consisting of the following information:
|
|
||||||
* cost (int): The number of misaligned tokens.
|
|
||||||
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
|
|
||||||
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
|
|
||||||
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
|
|
||||||
it has the value -1.
|
|
||||||
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
|
|
||||||
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
|
|
||||||
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
|
|
||||||
the same token of `tokens_b`.
|
|
||||||
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
|
||||||
direction.
|
|
||||||
"""
|
|
||||||
tokens_a = _normalize_for_alignment(tokens_a)
|
|
||||||
tokens_b = _normalize_for_alignment(tokens_b)
|
|
||||||
cost = 0
|
|
||||||
a2b = numpy.empty(len(tokens_a), dtype="i")
|
|
||||||
b2a = numpy.empty(len(tokens_b), dtype="i")
|
|
||||||
a2b.fill(-1)
|
|
||||||
b2a.fill(-1)
|
|
||||||
a2b_multi = {}
|
|
||||||
b2a_multi = {}
|
|
||||||
i = 0
|
|
||||||
j = 0
|
|
||||||
offset_a = 0
|
|
||||||
offset_b = 0
|
|
||||||
while i < len(tokens_a) and j < len(tokens_b):
|
|
||||||
a = tokens_a[i][offset_a:]
|
|
||||||
b = tokens_b[j][offset_b:]
|
|
||||||
if a == b:
|
|
||||||
if offset_a == offset_b == 0:
|
|
||||||
a2b[i] = j
|
|
||||||
b2a[j] = i
|
|
||||||
elif offset_a == 0:
|
|
||||||
cost += 2
|
|
||||||
a2b_multi[i] = j
|
|
||||||
elif offset_b == 0:
|
|
||||||
cost += 2
|
|
||||||
b2a_multi[j] = i
|
|
||||||
offset_a = offset_b = 0
|
|
||||||
i += 1
|
|
||||||
j += 1
|
|
||||||
elif a == "":
|
|
||||||
assert offset_a == 0
|
|
||||||
cost += 1
|
|
||||||
i += 1
|
|
||||||
elif b == "":
|
|
||||||
assert offset_b == 0
|
|
||||||
cost += 1
|
|
||||||
j += 1
|
|
||||||
elif b.startswith(a):
|
|
||||||
cost += 1
|
|
||||||
if offset_a == 0:
|
|
||||||
a2b_multi[i] = j
|
|
||||||
i += 1
|
|
||||||
offset_a = 0
|
|
||||||
offset_b += len(a)
|
|
||||||
elif a.startswith(b):
|
|
||||||
cost += 1
|
|
||||||
if offset_b == 0:
|
|
||||||
b2a_multi[j] = i
|
|
||||||
j += 1
|
|
||||||
offset_b = 0
|
|
||||||
offset_a += len(b)
|
|
||||||
else:
|
|
||||||
assert "".join(tokens_a) != "".join(tokens_b)
|
|
||||||
raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
|
|
||||||
return cost, a2b, b2a, a2b_multi, b2a_multi
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_for_alignment(tokens):
|
|
||||||
return [w.replace(" ", "").lower() for w in tokens]
|
|
|
@ -1,8 +1,7 @@
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from .align cimport Alignment
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Example:
|
cdef class Example:
|
||||||
cdef readonly Doc x
|
cdef readonly Doc x
|
||||||
cdef readonly Doc y
|
cdef readonly Doc y
|
||||||
cdef readonly Alignment _alignment
|
cdef readonly object _alignment
|
||||||
|
|
|
@ -6,10 +6,9 @@ from ..tokens.doc cimport Doc
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
from ..tokens.span import Span
|
from ..tokens.span import Span
|
||||||
from ..attrs import IDS
|
from ..attrs import IDS
|
||||||
from .align cimport Alignment
|
from .align import Alignment
|
||||||
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
||||||
from .iob_utils import spans_from_biluo_tags
|
from .iob_utils import spans_from_biluo_tags
|
||||||
from .align import Alignment
|
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..syntax import nonproj
|
from ..syntax import nonproj
|
||||||
|
|
||||||
|
@ -28,8 +27,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||||
|
|
||||||
|
|
||||||
cdef class Example:
|
cdef class Example:
|
||||||
def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
|
def __init__(self, Doc predicted, Doc reference, *, alignment=None):
|
||||||
""" Doc can either be text, or an actual Doc """
|
|
||||||
if predicted is None:
|
if predicted is None:
|
||||||
raise TypeError(Errors.E972.format(arg="predicted"))
|
raise TypeError(Errors.E972.format(arg="predicted"))
|
||||||
if reference is None:
|
if reference is None:
|
||||||
|
@ -60,17 +58,15 @@ cdef class Example:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, Doc predicted, dict example_dict):
|
def from_dict(cls, Doc predicted, dict example_dict):
|
||||||
|
if predicted is None:
|
||||||
|
raise ValueError(Errors.E976.format(n="first", type="Doc"))
|
||||||
if example_dict is None:
|
if example_dict is None:
|
||||||
raise ValueError(Errors.E976)
|
raise ValueError(Errors.E976.format(n="second", type="dict"))
|
||||||
if not isinstance(predicted, Doc):
|
|
||||||
raise TypeError(Errors.E975.format(type=type(predicted)))
|
|
||||||
example_dict = _fix_legacy_dict_data(example_dict)
|
example_dict = _fix_legacy_dict_data(example_dict)
|
||||||
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
||||||
if "ORTH" not in tok_dict:
|
if "ORTH" not in tok_dict:
|
||||||
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
||||||
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
||||||
if not _has_field(tok_dict, "SPACY"):
|
|
||||||
spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
|
|
||||||
return Example(
|
return Example(
|
||||||
predicted,
|
predicted,
|
||||||
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
||||||
|
@ -83,34 +79,38 @@ cdef class Example:
|
||||||
gold_words = [token.orth_ for token in self.reference]
|
gold_words = [token.orth_ for token in self.reference]
|
||||||
if gold_words == []:
|
if gold_words == []:
|
||||||
gold_words = spacy_words
|
gold_words = spacy_words
|
||||||
self._alignment = Alignment(spacy_words, gold_words)
|
self._alignment = Alignment.from_strings(spacy_words, gold_words)
|
||||||
return self._alignment
|
return self._alignment
|
||||||
|
|
||||||
def get_aligned(self, field, as_string=False):
|
def get_aligned(self, field, as_string=False):
|
||||||
"""Return an aligned array for a token attribute."""
|
"""Return an aligned array for a token attribute."""
|
||||||
i2j_multi = self.alignment.i2j_multi
|
align = self.alignment.x2y
|
||||||
cand_to_gold = self.alignment.cand_to_gold
|
|
||||||
|
|
||||||
vocab = self.reference.vocab
|
vocab = self.reference.vocab
|
||||||
gold_values = self.reference.to_array([field])
|
gold_values = self.reference.to_array([field])
|
||||||
output = [None] * len(self.predicted)
|
output = [None] * len(self.predicted)
|
||||||
for i, gold_i in enumerate(cand_to_gold):
|
for token in self.predicted:
|
||||||
if self.predicted[i].text.isspace():
|
if token.is_space:
|
||||||
output[i] = None
|
output[token.i] = None
|
||||||
if gold_i is None:
|
|
||||||
if i in i2j_multi:
|
|
||||||
output[i] = gold_values[i2j_multi[i]]
|
|
||||||
else:
|
|
||||||
output[i] = None
|
|
||||||
else:
|
else:
|
||||||
output[i] = gold_values[gold_i]
|
values = gold_values[align[token.i].dataXd]
|
||||||
|
values = values.ravel()
|
||||||
|
if len(values) == 0:
|
||||||
|
output[token.i] = None
|
||||||
|
elif len(values) == 1:
|
||||||
|
output[token.i] = values[0]
|
||||||
|
elif len(set(list(values))) == 1:
|
||||||
|
# If all aligned tokens have the same value, use it.
|
||||||
|
output[token.i] = values[0]
|
||||||
|
else:
|
||||||
|
output[token.i] = None
|
||||||
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
||||||
output = [vocab.strings[o] if o is not None else o for o in output]
|
output = [vocab.strings[o] if o is not None else o for o in output]
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def get_aligned_parse(self, projectivize=True):
|
def get_aligned_parse(self, projectivize=True):
|
||||||
cand_to_gold = self.alignment.cand_to_gold
|
cand_to_gold = self.alignment.x2y
|
||||||
gold_to_cand = self.alignment.gold_to_cand
|
gold_to_cand = self.alignment.y2x
|
||||||
aligned_heads = [None] * self.x.length
|
aligned_heads = [None] * self.x.length
|
||||||
aligned_deps = [None] * self.x.length
|
aligned_deps = [None] * self.x.length
|
||||||
heads = [token.head.i for token in self.y]
|
heads = [token.head.i for token in self.y]
|
||||||
|
@ -118,52 +118,51 @@ cdef class Example:
|
||||||
if projectivize:
|
if projectivize:
|
||||||
heads, deps = nonproj.projectivize(heads, deps)
|
heads, deps = nonproj.projectivize(heads, deps)
|
||||||
for cand_i in range(self.x.length):
|
for cand_i in range(self.x.length):
|
||||||
gold_i = cand_to_gold[cand_i]
|
if cand_to_gold.lengths[cand_i] == 1:
|
||||||
if gold_i is not None: # Alignment found
|
gold_i = cand_to_gold[cand_i].dataXd[0, 0]
|
||||||
gold_head = gold_to_cand[heads[gold_i]]
|
if gold_to_cand.lengths[heads[gold_i]] == 1:
|
||||||
if gold_head is not None:
|
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0])
|
||||||
aligned_heads[cand_i] = gold_head
|
|
||||||
aligned_deps[cand_i] = deps[gold_i]
|
aligned_deps[cand_i] = deps[gold_i]
|
||||||
return aligned_heads, aligned_deps
|
return aligned_heads, aligned_deps
|
||||||
|
|
||||||
|
def get_aligned_spans_x2y(self, x_spans):
|
||||||
|
return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y)
|
||||||
|
|
||||||
|
def get_aligned_spans_y2x(self, y_spans):
|
||||||
|
return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x)
|
||||||
|
|
||||||
|
def _get_aligned_spans(self, doc, spans, align):
|
||||||
|
seen = set()
|
||||||
|
output = []
|
||||||
|
for span in spans:
|
||||||
|
indices = align[span.start : span.end].data.ravel()
|
||||||
|
indices = [idx for idx in indices if idx not in seen]
|
||||||
|
if len(indices) >= 1:
|
||||||
|
aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label)
|
||||||
|
target_text = span.text.lower().strip().replace(" ", "")
|
||||||
|
our_text = aligned_span.text.lower().strip().replace(" ", "")
|
||||||
|
if our_text == target_text:
|
||||||
|
output.append(aligned_span)
|
||||||
|
seen.update(indices)
|
||||||
|
return output
|
||||||
|
|
||||||
def get_aligned_ner(self):
|
def get_aligned_ner(self):
|
||||||
if not self.y.is_nered:
|
if not self.y.is_nered:
|
||||||
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
|
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
|
||||||
x_text = self.x.text
|
x_ents = self.get_aligned_spans_y2x(self.y.ents)
|
||||||
# Get a list of entities, and make spans for non-entity tokens.
|
# Default to 'None' for missing values
|
||||||
# We then work through the spans in order, trying to find them in
|
|
||||||
# the text and using that to get the offset. Any token that doesn't
|
|
||||||
# get a tag set this way is tagged None.
|
|
||||||
# This could maybe be improved? It at least feels easy to reason about.
|
|
||||||
y_spans = list(self.y.ents)
|
|
||||||
y_spans.sort()
|
|
||||||
x_text_offset = 0
|
|
||||||
x_spans = []
|
|
||||||
for y_span in y_spans:
|
|
||||||
if x_text.count(y_span.text) >= 1:
|
|
||||||
start_char = x_text.index(y_span.text) + x_text_offset
|
|
||||||
end_char = start_char + len(y_span.text)
|
|
||||||
x_span = self.x.char_span(start_char, end_char, label=y_span.label)
|
|
||||||
if x_span is not None:
|
|
||||||
x_spans.append(x_span)
|
|
||||||
x_text = self.x.text[end_char:]
|
|
||||||
x_text_offset = end_char
|
|
||||||
x_tags = biluo_tags_from_offsets(
|
x_tags = biluo_tags_from_offsets(
|
||||||
self.x,
|
self.x,
|
||||||
[(e.start_char, e.end_char, e.label_) for e in x_spans],
|
[(e.start_char, e.end_char, e.label_) for e in x_ents],
|
||||||
missing=None
|
missing=None
|
||||||
)
|
)
|
||||||
gold_to_cand = self.alignment.gold_to_cand
|
# Now fill the tokens we can align to O.
|
||||||
for token in self.y:
|
O = 2 # I=1, O=2, B=3
|
||||||
if token.ent_iob_ == "O":
|
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
|
||||||
cand_i = gold_to_cand[token.i]
|
if x_tags[i] is None:
|
||||||
if cand_i is not None and x_tags[cand_i] is None:
|
if ent_iob == O:
|
||||||
x_tags[cand_i] = "O"
|
x_tags[i] = "O"
|
||||||
i2j_multi = self.alignment.i2j_multi
|
elif self.x[i].is_space:
|
||||||
for i, tag in enumerate(x_tags):
|
|
||||||
if tag is None and i in i2j_multi:
|
|
||||||
gold_i = i2j_multi[i]
|
|
||||||
if gold_i is not None and self.y[gold_i].ent_iob_ == "O":
|
|
||||||
x_tags[i] = "O"
|
x_tags[i] = "O"
|
||||||
return x_tags
|
return x_tags
|
||||||
|
|
||||||
|
@ -194,25 +193,22 @@ cdef class Example:
|
||||||
links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
|
links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
def split_sents(self):
|
def split_sents(self):
|
||||||
""" Split the token annotations into multiple Examples based on
|
""" Split the token annotations into multiple Examples based on
|
||||||
sent_starts and return a list of the new Examples"""
|
sent_starts and return a list of the new Examples"""
|
||||||
if not self.reference.is_sentenced:
|
if not self.reference.is_sentenced:
|
||||||
return [self]
|
return [self]
|
||||||
|
|
||||||
sent_starts = self.get_aligned("SENT_START")
|
align = self.alignment.y2x
|
||||||
sent_starts.append(1) # appending virtual start of a next sentence to facilitate search
|
seen_indices = set()
|
||||||
|
|
||||||
output = []
|
output = []
|
||||||
pred_start = 0
|
for y_sent in self.reference.sents:
|
||||||
for sent in self.reference.sents:
|
indices = align[y_sent.start : y_sent.end].data.ravel()
|
||||||
new_ref = sent.as_doc()
|
indices = [idx for idx in indices if idx not in seen_indices]
|
||||||
pred_end = sent_starts.index(1, pred_start+1) # find where the next sentence starts
|
if indices:
|
||||||
new_pred = self.predicted[pred_start : pred_end].as_doc()
|
x_sent = self.predicted[indices[0] : indices[-1] + 1]
|
||||||
output.append(Example(new_pred, new_ref))
|
output.append(Example(x_sent.as_doc(), y_sent.as_doc()))
|
||||||
pred_start = pred_end
|
seen_indices.update(indices)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
property text:
|
property text:
|
||||||
|
@ -258,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
values.append([vocab.morphology.add(v) for v in value])
|
values.append([vocab.morphology.add(v) for v in value])
|
||||||
else:
|
else:
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.strings.add(v) for v in value])
|
try:
|
||||||
|
values.append([vocab.strings.add(v) for v in value])
|
||||||
|
except TypeError:
|
||||||
|
types= set([type(v) for v in value])
|
||||||
|
raise TypeError(Errors.E969.format(field=key, types=types))
|
||||||
|
|
||||||
array = numpy.asarray(values, dtype="uint64")
|
array = numpy.asarray(values, dtype="uint64")
|
||||||
return attrs, array.T
|
return attrs, array.T
|
||||||
|
|
|
@ -540,19 +540,15 @@ class Language(object):
|
||||||
|
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
component_deps = count_pipeline_interdependencies(self.pipeline)
|
|
||||||
# Determine whether component should set annotations. In theory I guess
|
|
||||||
# we should do this by inspecting the meta? Or we could just always
|
|
||||||
# say "yes"
|
|
||||||
for i, (name, proc) in enumerate(self.pipeline):
|
for i, (name, proc) in enumerate(self.pipeline):
|
||||||
component_cfg.setdefault(name, {})
|
component_cfg.setdefault(name, {})
|
||||||
component_cfg[name].setdefault("drop", drop)
|
component_cfg[name].setdefault("drop", drop)
|
||||||
component_cfg[name]["set_annotations"] = bool(component_deps[i])
|
component_cfg[name].setdefault("set_annotations", False)
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if not hasattr(proc, "update"):
|
if not hasattr(proc, "update"):
|
||||||
continue
|
continue
|
||||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||||||
if sgd is not False:
|
if sgd not in (None, False):
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if hasattr(proc, "model"):
|
if hasattr(proc, "model"):
|
||||||
proc.model.finish_update(sgd)
|
proc.model.finish_update(sgd)
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
from thinc.api import Model, normal_init
|
from thinc.api import Model, normal_init
|
||||||
|
|
||||||
|
|
||||||
def PrecomputableAffine(nO, nI, nF, nP):
|
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
|
||||||
model = Model(
|
model = Model(
|
||||||
"precomputable_affine",
|
"precomputable_affine",
|
||||||
forward,
|
forward,
|
||||||
init=init,
|
init=init,
|
||||||
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
|
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
|
||||||
params={"W": None, "b": None, "pad": None},
|
params={"W": None, "b": None, "pad": None},
|
||||||
|
attrs={"dropout_rate": dropout}
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
@ -48,17 +49,14 @@ def forward(model, X, is_train):
|
||||||
model.inc_grad("b", dY.sum(axis=0))
|
model.inc_grad("b", dY.sum(axis=0))
|
||||||
dY = dY.reshape((dY.shape[0], nO * nP))
|
dY = dY.reshape((dY.shape[0], nO * nP))
|
||||||
|
|
||||||
Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
|
Wopfi = W.transpose((1, 2, 0, 3))
|
||||||
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
|
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
|
||||||
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
|
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
|
||||||
|
|
||||||
# Reuse the buffer
|
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
|
||||||
dWopfi = Wopfi
|
|
||||||
dWopfi.fill(0.0)
|
|
||||||
model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
|
||||||
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
|
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
|
||||||
# (o, p, f, i) --> (f, o, p, i)
|
# (o, p, f, i) --> (f, o, p, i)
|
||||||
dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
|
dWopfi = dWopfi.transpose((2, 0, 1, 3))
|
||||||
model.inc_grad("W", dWopfi)
|
model.inc_grad("W", dWopfi)
|
||||||
return dXf.reshape((dXf.shape[0], nF, nI))
|
return dXf.reshape((dXf.shape[0], nF, nI))
|
||||||
|
|
||||||
|
|
|
@ -263,20 +263,20 @@ def build_Tok2Vec_model(
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||||
norm = HashEmbed(
|
norm = HashEmbed(
|
||||||
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout,
|
nO=width, nV=embed_size, column=cols.index(NORM), dropout=None,
|
||||||
seed=0
|
seed=0
|
||||||
)
|
)
|
||||||
if subword_features:
|
if subword_features:
|
||||||
prefix = HashEmbed(
|
prefix = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None,
|
||||||
seed=1
|
seed=1
|
||||||
)
|
)
|
||||||
suffix = HashEmbed(
|
suffix = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None,
|
||||||
seed=2
|
seed=2
|
||||||
)
|
)
|
||||||
shape = HashEmbed(
|
shape = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None,
|
||||||
seed=3
|
seed=3
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
@ -296,7 +296,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -309,7 +309,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -322,7 +322,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -335,7 +335,7 @@ def build_Tok2Vec_model(
|
||||||
reduce_dimensions = Maxout(
|
reduce_dimensions = Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=nM * nC + width,
|
nI=nM * nC + width,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear
|
||||||
from ..syntax._parser_model import ParserStepModel
|
from ..syntax._parser_model import ParserStepModel
|
||||||
|
|
||||||
|
|
||||||
def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
|
def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
|
||||||
"""Set up a stepwise transition-based model"""
|
"""Set up a stepwise transition-based model"""
|
||||||
if upper is None:
|
if upper is None:
|
||||||
has_upper = False
|
has_upper = False
|
||||||
|
|
|
@ -272,7 +272,7 @@ cdef class Morphology:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def feats_to_dict(feats):
|
def feats_to_dict(feats):
|
||||||
if not feats:
|
if not feats or feats == Morphology.EMPTY_MORPH:
|
||||||
return {}
|
return {}
|
||||||
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
||||||
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
||||||
|
|
|
@ -3,7 +3,7 @@ cimport numpy as np
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import to_categorical
|
from thinc.api import SequenceCategoricalCrossentropy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
|
@ -85,13 +85,10 @@ class Morphologizer(Tagger):
|
||||||
doc.is_morphed = True
|
doc.is_morphed = True
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
tag_index = {tag: i for i, tag in enumerate(self.labels)}
|
truths = []
|
||||||
cdef int idx = 0
|
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
|
||||||
guesses = scores.argmax(axis=1)
|
|
||||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
|
eg_truths = []
|
||||||
pos_tags = eg.get_aligned("POS", as_string=True)
|
pos_tags = eg.get_aligned("POS", as_string=True)
|
||||||
morphs = eg.get_aligned("MORPH", as_string=True)
|
morphs = eg.get_aligned("MORPH", as_string=True)
|
||||||
for i in range(len(morphs)):
|
for i in range(len(morphs)):
|
||||||
|
@ -104,20 +101,11 @@ class Morphologizer(Tagger):
|
||||||
morph = self.vocab.strings[self.vocab.morphology.add(feats)]
|
morph = self.vocab.strings[self.vocab.morphology.add(feats)]
|
||||||
if morph == "":
|
if morph == "":
|
||||||
morph = Morphology.EMPTY_MORPH
|
morph = Morphology.EMPTY_MORPH
|
||||||
if morph is None:
|
eg_truths.append(morph)
|
||||||
correct[idx] = guesses[idx]
|
truths.append(eg_truths)
|
||||||
elif morph in tag_index:
|
d_scores, loss = loss_func(scores, truths)
|
||||||
correct[idx] = tag_index[morph]
|
if self.model.ops.xp.isnan(loss):
|
||||||
else:
|
raise ValueError("nan value when computing loss")
|
||||||
correct[idx] = 0
|
|
||||||
known_labels[idx] = 0.
|
|
||||||
idx += 1
|
|
||||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
|
||||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
|
||||||
d_scores *= self.model.ops.asarray(known_labels)
|
|
||||||
loss = (d_scores**2).sum()
|
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple()):
|
def to_bytes(self, exclude=tuple()):
|
||||||
|
|
|
@ -334,7 +334,7 @@ class Tagger(Pipe):
|
||||||
losses[self.name] += (gradient**2).sum()
|
losses[self.name] += (gradient**2).sum()
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
||||||
d_scores, loss = loss_func(scores, truths)
|
d_scores, loss = loss_func(scores, truths)
|
||||||
if self.model.ops.xp.isnan(loss):
|
if self.model.ops.xp.isnan(loss):
|
||||||
|
@ -521,29 +521,23 @@ class SentenceRecognizer(Tagger):
|
||||||
doc.c[j].sent_start = -1
|
doc.c[j].sent_start = -1
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
labels = self.labels
|
||||||
tag_index = range(len(self.labels))
|
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
||||||
cdef int idx = 0
|
truths = []
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
|
||||||
guesses = scores.argmax(axis=1)
|
|
||||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
sent_starts = eg.get_aligned("sent_start")
|
eg_truth = []
|
||||||
for sent_start in sent_starts:
|
for x in eg.get_aligned("sent_start"):
|
||||||
if sent_start is None:
|
if x == None:
|
||||||
correct[idx] = guesses[idx]
|
eg_truth.append(None)
|
||||||
elif sent_start in tag_index:
|
elif x == 1:
|
||||||
correct[idx] = sent_start
|
eg_truth.append(labels[1])
|
||||||
else:
|
else:
|
||||||
correct[idx] = 0
|
# anything other than 1: 0, -1, -1 as uint64
|
||||||
known_labels[idx] = 0.
|
eg_truth.append(labels[0])
|
||||||
idx += 1
|
truths.append(eg_truth)
|
||||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
d_scores, loss = loss_func(scores, truths)
|
||||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
if self.model.ops.xp.isnan(loss):
|
||||||
d_scores *= self.model.ops.asarray(known_labels)
|
raise ValueError("nan value when computing loss")
|
||||||
loss = (d_scores**2).sum()
|
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
||||||
|
|
|
@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
|
||||||
class ProjectConfigAsset(BaseModel):
|
class ProjectConfigAsset(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
||||||
url: StrictStr = Field(..., title="URL of asset")
|
url: Optional[StrictStr] = Field(None, title="URL of asset")
|
||||||
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
@ -246,7 +246,7 @@ class ProjectConfigSchema(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
|
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
|
||||||
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
|
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
|
||||||
run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
|
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
||||||
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
|
@ -326,10 +326,11 @@ class Scorer(object):
|
||||||
for token in doc:
|
for token in doc:
|
||||||
if token.orth_.isspace():
|
if token.orth_.isspace():
|
||||||
continue
|
continue
|
||||||
gold_i = align.cand_to_gold[token.i]
|
if align.x2y.lengths[token.i] != 1:
|
||||||
if gold_i is None:
|
|
||||||
self.tokens.fp += 1
|
self.tokens.fp += 1
|
||||||
|
gold_i = None
|
||||||
else:
|
else:
|
||||||
|
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||||
self.tokens.tp += 1
|
self.tokens.tp += 1
|
||||||
cand_tags.add((gold_i, token.tag_))
|
cand_tags.add((gold_i, token.tag_))
|
||||||
cand_pos.add((gold_i, token.pos_))
|
cand_pos.add((gold_i, token.pos_))
|
||||||
|
@ -345,7 +346,10 @@ class Scorer(object):
|
||||||
if token.is_sent_start:
|
if token.is_sent_start:
|
||||||
cand_sent_starts.add(gold_i)
|
cand_sent_starts.add(gold_i)
|
||||||
if token.dep_.lower() not in punct_labels and token.orth_.strip():
|
if token.dep_.lower() not in punct_labels and token.orth_.strip():
|
||||||
gold_head = align.cand_to_gold[token.head.i]
|
if align.x2y.lengths[token.head.i] == 1:
|
||||||
|
gold_head = align.x2y[token.head.i].dataXd[0, 0]
|
||||||
|
else:
|
||||||
|
gold_head = None
|
||||||
# None is indistinct, so we can't just add it to the set
|
# None is indistinct, so we can't just add it to the set
|
||||||
# Multiple (None, None) deps are possible
|
# Multiple (None, None) deps are possible
|
||||||
if gold_i is None or gold_head is None:
|
if gold_i is None or gold_head is None:
|
||||||
|
@ -381,15 +385,9 @@ class Scorer(object):
|
||||||
gold_ents.add(gold_ent)
|
gold_ents.add(gold_ent)
|
||||||
gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
|
gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
|
||||||
cand_per_ents = {ent_label: set() for ent_label in ent_labels}
|
cand_per_ents = {ent_label: set() for ent_label in ent_labels}
|
||||||
for ent in doc.ents:
|
for ent in example.get_aligned_spans_x2y(doc.ents):
|
||||||
first = align.cand_to_gold[ent.start]
|
cand_ents.add((ent.label_, ent.start, ent.end - 1))
|
||||||
last = align.cand_to_gold[ent.end - 1]
|
cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
|
||||||
if first is None or last is None:
|
|
||||||
self.ner.fp += 1
|
|
||||||
self.ner_per_ents[ent.label_].fp += 1
|
|
||||||
else:
|
|
||||||
cand_ents.add((ent.label_, first, last))
|
|
||||||
cand_per_ents[ent.label_].add((ent.label_, first, last))
|
|
||||||
# Scores per ent
|
# Scores per ent
|
||||||
for k, v in self.ner_per_ents.items():
|
for k, v in self.ner_per_ents.items():
|
||||||
if k in cand_per_ents:
|
if k in cand_per_ents:
|
||||||
|
|
|
@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
||||||
|
|
||||||
|
|
||||||
class ParserStepModel(Model):
|
class ParserStepModel(Model):
|
||||||
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
|
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
|
||||||
|
dropout=0.1):
|
||||||
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
||||||
self.attrs["has_upper"] = has_upper
|
self.attrs["has_upper"] = has_upper
|
||||||
|
self.attrs["dropout_rate"] = dropout
|
||||||
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
|
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
|
||||||
if layers[1].get_dim("nP") >= 2:
|
if layers[1].get_dim("nP") >= 2:
|
||||||
activation = "maxout"
|
activation = "maxout"
|
||||||
|
@ -289,11 +291,17 @@ class ParserStepModel(Model):
|
||||||
self.bp_tokvecs(d_tokvecs[:-1])
|
self.bp_tokvecs(d_tokvecs[:-1])
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
|
NUMPY_OPS = NumpyOps()
|
||||||
|
|
||||||
def step_forward(model: ParserStepModel, states, is_train):
|
def step_forward(model: ParserStepModel, states, is_train):
|
||||||
token_ids = model.get_token_ids(states)
|
token_ids = model.get_token_ids(states)
|
||||||
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
||||||
|
mask = None
|
||||||
if model.attrs["has_upper"]:
|
if model.attrs["has_upper"]:
|
||||||
|
dropout_rate = model.attrs["dropout_rate"]
|
||||||
|
if is_train and dropout_rate > 0:
|
||||||
|
mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
|
||||||
|
vector *= mask
|
||||||
scores, get_d_vector = model.vec2scores(vector, is_train)
|
scores, get_d_vector = model.vec2scores(vector, is_train)
|
||||||
else:
|
else:
|
||||||
scores = NumpyOps().asarray(vector)
|
scores = NumpyOps().asarray(vector)
|
||||||
|
@ -305,6 +313,8 @@ def step_forward(model: ParserStepModel, states, is_train):
|
||||||
# Zero vectors for unseen classes
|
# Zero vectors for unseen classes
|
||||||
d_scores *= model._class_mask
|
d_scores *= model._class_mask
|
||||||
d_vector = get_d_vector(d_scores)
|
d_vector = get_d_vector(d_scores)
|
||||||
|
if mask is not None:
|
||||||
|
d_vector *= mask
|
||||||
if isinstance(model.state2vec.ops, CupyOps) \
|
if isinstance(model.state2vec.ops, CupyOps) \
|
||||||
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to GPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
|
@ -437,7 +447,7 @@ cdef class precompute_hiddens:
|
||||||
sum_state_features(<float*>state_vector.data,
|
sum_state_features(<float*>state_vector.data,
|
||||||
feat_weights, &ids[0,0],
|
feat_weights, &ids[0,0],
|
||||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||||
state_vector = state_vector + self.bias
|
state_vector += self.bias
|
||||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||||
|
|
||||||
def backward(d_state_vector_ids):
|
def backward(d_state_vector_ids):
|
||||||
|
|
|
@ -65,7 +65,6 @@ cdef class Parser:
|
||||||
self.set_output(self.moves.n_moves)
|
self.set_output(self.moves.n_moves)
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
||||||
self.cfg.setdefault("normalize_gradients_with_batch_size", True)
|
|
||||||
self._multitasks = []
|
self._multitasks = []
|
||||||
for multitask in cfg.get("multitasks", []):
|
for multitask in cfg.get("multitasks", []):
|
||||||
self.add_multitask_objective(multitask)
|
self.add_multitask_objective(multitask)
|
||||||
|
@ -280,11 +279,12 @@ cdef class Parser:
|
||||||
[eg.predicted for eg in examples])
|
[eg.predicted for eg in examples])
|
||||||
if self.cfg["update_with_oracle_cut_size"] >= 1:
|
if self.cfg["update_with_oracle_cut_size"] >= 1:
|
||||||
# Chop sequences into lengths of this many transitions, to make the
|
# Chop sequences into lengths of this many transitions, to make the
|
||||||
# batch uniform length. We randomize this to overfit less.
|
# batch uniform length.
|
||||||
|
# We used to randomize this, but it's not clear that actually helps?
|
||||||
cut_size = self.cfg["update_with_oracle_cut_size"]
|
cut_size = self.cfg["update_with_oracle_cut_size"]
|
||||||
states, golds, max_steps = self._init_gold_batch(
|
states, golds, max_steps = self._init_gold_batch(
|
||||||
examples,
|
examples,
|
||||||
max_length=numpy.random.choice(range(5, cut_size))
|
max_length=cut_size
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||||
|
@ -292,24 +292,15 @@ cdef class Parser:
|
||||||
if not states:
|
if not states:
|
||||||
return losses
|
return losses
|
||||||
all_states = list(states)
|
all_states = list(states)
|
||||||
states_golds = zip(states, golds)
|
states_golds = list(zip(states, golds))
|
||||||
for _ in range(max_steps):
|
while states_golds:
|
||||||
if not states_golds:
|
|
||||||
break
|
|
||||||
states, golds = zip(*states_golds)
|
states, golds = zip(*states_golds)
|
||||||
scores, backprop = model.begin_update(states)
|
scores, backprop = model.begin_update(states)
|
||||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||||
if self.cfg["normalize_gradients_with_batch_size"]:
|
# Note that the gradient isn't normalized by the batch size
|
||||||
# We have to be very careful how we do this, because of the way we
|
# here, because our "samples" are really the states...But we
|
||||||
# cut up the batch. We subdivide long sequences. If we normalize
|
# can't normalize by the number of states either, as then we'd
|
||||||
# naively, we end up normalizing by sequence length, which
|
# be getting smaller gradients for states in long sequences.
|
||||||
# is bad: that would mean that states in long sequences
|
|
||||||
# consistently get smaller gradients. Imagine if we have two
|
|
||||||
# sequences, one length 1000, one length 20. If we cut up
|
|
||||||
# the 1k sequence so that we have a "batch" of 50 subsequences,
|
|
||||||
# we don't want the gradients to get 50 times smaller!
|
|
||||||
d_scores /= n_examples
|
|
||||||
|
|
||||||
backprop(d_scores)
|
backprop(d_scores)
|
||||||
# Follow the predicted action
|
# Follow the predicted action
|
||||||
self.transition_states(states, scores)
|
self.transition_states(states, scores)
|
||||||
|
@ -407,6 +398,7 @@ cdef class Parser:
|
||||||
cpu_log_loss(c_d_scores,
|
cpu_log_loss(c_d_scores,
|
||||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||||
c_d_scores += d_scores.shape[1]
|
c_d_scores += d_scores.shape[1]
|
||||||
|
# Note that we don't normalize this. See comment in update() for why.
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
losses[self.name] += (d_scores**2).sum()
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
@ -525,21 +517,25 @@ cdef class Parser:
|
||||||
StateClass state
|
StateClass state
|
||||||
Transition action
|
Transition action
|
||||||
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
||||||
|
states = []
|
||||||
|
golds = []
|
||||||
kept = []
|
kept = []
|
||||||
max_length_seen = 0
|
max_length_seen = 0
|
||||||
for state, eg in zip(all_states, examples):
|
for state, eg in zip(all_states, examples):
|
||||||
if self.moves.has_gold(eg) and not state.is_final():
|
if self.moves.has_gold(eg) and not state.is_final():
|
||||||
gold = self.moves.init_gold(state, eg)
|
gold = self.moves.init_gold(state, eg)
|
||||||
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
if len(eg.x) < max_length:
|
||||||
state.copy(), gold)
|
states.append(state)
|
||||||
kept.append((eg, state, gold, oracle_actions))
|
golds.append(gold)
|
||||||
min_length = min(min_length, len(oracle_actions))
|
else:
|
||||||
max_length_seen = max(max_length, len(oracle_actions))
|
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
||||||
|
state.copy(), gold)
|
||||||
|
kept.append((eg, state, gold, oracle_actions))
|
||||||
|
min_length = min(min_length, len(oracle_actions))
|
||||||
|
max_length_seen = max(max_length, len(oracle_actions))
|
||||||
if not kept:
|
if not kept:
|
||||||
return [], [], 0
|
return states, golds, 0
|
||||||
max_length = max(min_length, min(max_length, max_length_seen))
|
max_length = max(min_length, min(max_length, max_length_seen))
|
||||||
states = []
|
|
||||||
golds = []
|
|
||||||
cdef int clas
|
cdef int clas
|
||||||
max_moves = 0
|
max_moves = 0
|
||||||
for eg, state, gold, oracle_actions in kept:
|
for eg, state, gold, oracle_actions in kept:
|
||||||
|
|
|
@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
|
|
||||||
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
assert contains_cycle(tree) is None
|
assert contains_cycle(tree) is None
|
||||||
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
|
assert contains_cycle(cyclic_tree) == {3, 4, 5}
|
||||||
assert contains_cycle(partial_tree) is None
|
assert contains_cycle(partial_tree) is None
|
||||||
assert contains_cycle(multirooted_tree) is None
|
assert contains_cycle(multirooted_tree) is None
|
||||||
|
|
||||||
|
|
|
@ -38,6 +38,11 @@ def test_overfitting_IO():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
|
# add some cases where SENT_START == -1
|
||||||
|
train_examples[0].reference[10].is_sent_start = False
|
||||||
|
train_examples[1].reference[1].is_sent_start = False
|
||||||
|
train_examples[1].reference[11].is_sent_start = False
|
||||||
|
|
||||||
nlp.add_pipe(senter)
|
nlp.add_pipe(senter)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ def test_issue2070():
|
||||||
assert len(doc) == 11
|
assert len(doc) == 11
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2179():
|
def test_issue2179():
|
||||||
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
|
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
|
||||||
nlp = Italian()
|
nlp = Italian()
|
||||||
|
@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
|
||||||
assert len(matches) == 3
|
assert len(matches) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2482():
|
def test_issue2482():
|
||||||
"""Test we can serialize and deserialize a blank NER or parser model."""
|
"""Test we can serialize and deserialize a blank NER or parser model."""
|
||||||
nlp = Italian()
|
nlp = Italian()
|
||||||
|
|
|
@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
|
||||||
assert doc[0].like_num
|
assert doc[0].like_num
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2800():
|
def test_issue2800():
|
||||||
"""Test issue that arises when too many labels are added to NER model.
|
"""Test issue that arises when too many labels are added to NER model.
|
||||||
Used to cause segfault.
|
Used to cause segfault.
|
||||||
"""
|
"""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
train_data = []
|
train_data = []
|
||||||
train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
|
train_data.extend(
|
||||||
|
[Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
|
||||||
|
)
|
||||||
entity_types = [str(i) for i in range(1000)]
|
entity_types = [str(i) for i in range(1000)]
|
||||||
ner = nlp.create_pipe("ner")
|
ner = nlp.create_pipe("ner")
|
||||||
nlp.add_pipe(ner)
|
nlp.add_pipe(ner)
|
||||||
|
|
|
@ -88,6 +88,7 @@ def test_issue3199():
|
||||||
assert list(doc[0:3].noun_chunks) == []
|
assert list(doc[0:3].noun_chunks) == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue3209():
|
def test_issue3209():
|
||||||
"""Test issue that occurred in spaCy nightly where NER labels were being
|
"""Test issue that occurred in spaCy nightly where NER labels were being
|
||||||
mapped to classes incorrectly after loading the model, when the labels
|
mapped to classes incorrectly after loading the model, when the labels
|
||||||
|
|
472
spacy/tests/regression/test_issue3501-4000.py
Normal file
472
spacy/tests/regression/test_issue3501-4000.py
Normal file
|
@ -0,0 +1,472 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.pipeline import EntityRuler, DependencyParser
|
||||||
|
from spacy.pipeline.defaults import default_parser
|
||||||
|
from spacy import displacy, load
|
||||||
|
from spacy.displacy import parse_deps
|
||||||
|
from spacy.tokens import Doc, Token
|
||||||
|
from spacy.matcher import Matcher, PhraseMatcher
|
||||||
|
from spacy.errors import MatchPatternError
|
||||||
|
from spacy.util import minibatch
|
||||||
|
from spacy.gold import Example
|
||||||
|
from spacy.lang.hi import Hindi
|
||||||
|
from spacy.lang.es import Spanish
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.attrs import IS_ALPHA
|
||||||
|
from thinc.api import compounding
|
||||||
|
import spacy
|
||||||
|
import srsly
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
from ..util import make_tempdir, get_doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
||||||
|
def test_issue3521(en_tokenizer, word):
|
||||||
|
tok = en_tokenizer(word)[1]
|
||||||
|
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||||
|
assert tok.is_stop
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_1(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
ruler_bytes = ruler.to_bytes()
|
||||||
|
assert len(ruler) == len(patterns)
|
||||||
|
assert len(ruler.labels) == 4
|
||||||
|
assert ruler.overwrite
|
||||||
|
new_ruler = EntityRuler(nlp)
|
||||||
|
new_ruler = new_ruler.from_bytes(ruler_bytes)
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
assert len(new_ruler.labels) == 4
|
||||||
|
assert new_ruler.overwrite == ruler.overwrite
|
||||||
|
assert new_ruler.ent_id_sep == ruler.ent_id_sep
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_2(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
|
||||||
|
new_ruler = EntityRuler(nlp)
|
||||||
|
new_ruler = new_ruler.from_bytes(bytes_old_style)
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
for pattern in ruler.patterns:
|
||||||
|
assert pattern in new_ruler.patterns
|
||||||
|
assert new_ruler.overwrite is not ruler.overwrite
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_3(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
out_file = tmpdir / "entity_ruler"
|
||||||
|
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
|
||||||
|
new_ruler = EntityRuler(nlp).from_disk(out_file)
|
||||||
|
for pattern in ruler.patterns:
|
||||||
|
assert pattern in new_ruler.patterns
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
assert new_ruler.overwrite is not ruler.overwrite
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue_3526_4(en_vocab):
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, overwrite_ents=True)
|
||||||
|
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
nlp.to_disk(tmpdir)
|
||||||
|
ruler = nlp.get_pipe("entity_ruler")
|
||||||
|
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||||
|
assert ruler.overwrite is True
|
||||||
|
nlp2 = load(tmpdir)
|
||||||
|
new_ruler = nlp2.get_pipe("entity_ruler")
|
||||||
|
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||||
|
assert new_ruler.overwrite is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3531():
|
||||||
|
"""Test that displaCy renderer doesn't require "settings" key."""
|
||||||
|
example_dep = {
|
||||||
|
"words": [
|
||||||
|
{"text": "But", "tag": "CCONJ"},
|
||||||
|
{"text": "Google", "tag": "PROPN"},
|
||||||
|
{"text": "is", "tag": "VERB"},
|
||||||
|
{"text": "starting", "tag": "VERB"},
|
||||||
|
{"text": "from", "tag": "ADP"},
|
||||||
|
{"text": "behind.", "tag": "ADV"},
|
||||||
|
],
|
||||||
|
"arcs": [
|
||||||
|
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
|
||||||
|
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
|
||||||
|
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
|
||||||
|
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
|
||||||
|
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
example_ent = {
|
||||||
|
"text": "But Google is starting from behind.",
|
||||||
|
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
||||||
|
}
|
||||||
|
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
||||||
|
assert dep_html
|
||||||
|
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
||||||
|
assert ent_html
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3540(en_vocab):
|
||||||
|
words = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
tensor = numpy.asarray(
|
||||||
|
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
||||||
|
dtype="f",
|
||||||
|
)
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
doc.tensor = tensor
|
||||||
|
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
assert [token.text for token in doc] == gold_text
|
||||||
|
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
assert [token.lemma_ for token in doc] == gold_lemma
|
||||||
|
vectors_1 = [token.vector for token in doc]
|
||||||
|
assert len(vectors_1) == len(doc)
|
||||||
|
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
|
heads = [(doc[3], 1), doc[2]]
|
||||||
|
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||||
|
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||||
|
|
||||||
|
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||||
|
assert [token.text for token in doc] == gold_text
|
||||||
|
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
||||||
|
assert [token.lemma_ for token in doc] == gold_lemma
|
||||||
|
vectors_2 = [token.vector for token in doc]
|
||||||
|
assert len(vectors_2) == len(doc)
|
||||||
|
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
||||||
|
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
||||||
|
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
||||||
|
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
||||||
|
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3549(en_vocab):
|
||||||
|
"""Test that match pattern validation doesn't raise on empty errors."""
|
||||||
|
matcher = Matcher(en_vocab, validate=True)
|
||||||
|
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||||
|
matcher.add("GOOD", [pattern])
|
||||||
|
with pytest.raises(MatchPatternError):
|
||||||
|
matcher.add("BAD", [[{"X": "Y"}]])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_issue3555(en_vocab):
|
||||||
|
"""Test that custom extensions with default None don't break matcher."""
|
||||||
|
Token.set_extension("issue3555", default=None)
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["have", "apple"])
|
||||||
|
matcher(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3611():
|
||||||
|
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
||||||
|
unique_classes = ["offensive", "inoffensive"]
|
||||||
|
x_train = [
|
||||||
|
"This is an offensive text",
|
||||||
|
"This is the second offensive text",
|
||||||
|
"inoff",
|
||||||
|
]
|
||||||
|
y_train = ["offensive", "offensive", "inoffensive"]
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
# preparing the data
|
||||||
|
train_data = []
|
||||||
|
for text, train_instance in zip(x_train, y_train):
|
||||||
|
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||||
|
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||||
|
# add a text categorizer component
|
||||||
|
textcat = nlp.create_pipe(
|
||||||
|
"textcat",
|
||||||
|
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||||
|
)
|
||||||
|
for label in unique_classes:
|
||||||
|
textcat.add_label(label)
|
||||||
|
nlp.add_pipe(textcat, last=True)
|
||||||
|
# training the network
|
||||||
|
with nlp.select_pipes(enable="textcat"):
|
||||||
|
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
||||||
|
for i in range(3):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(
|
||||||
|
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3625():
|
||||||
|
"""Test that default punctuation rules applies to hindi unicode characters"""
|
||||||
|
nlp = Hindi()
|
||||||
|
doc = nlp("hi. how हुए. होटल, होटल")
|
||||||
|
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
||||||
|
assert [token.text for token in doc] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3803():
|
||||||
|
"""Test that spanish num-like tokens have True for like_num attribute."""
|
||||||
|
nlp = Spanish()
|
||||||
|
text = "2 dos 1000 mil 12 doce"
|
||||||
|
doc = nlp(text)
|
||||||
|
|
||||||
|
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3830_no_subtok():
|
||||||
|
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||||
|
config = {
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||||
|
parser.add_label("nsubj")
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
parser.begin_training(lambda: [])
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3830_with_subtok():
|
||||||
|
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||||
|
config = {
|
||||||
|
"learn_tokens": True,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||||
|
parser.add_label("nsubj")
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
parser.begin_training(lambda: [])
|
||||||
|
assert "subtok" in parser.labels
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3839(en_vocab):
|
||||||
|
"""Test that match IDs returned by the matcher are correct, are in the string """
|
||||||
|
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
match_id = "PATTERN"
|
||||||
|
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||||
|
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||||
|
matcher.add(match_id, [pattern1])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches[0][0] == en_vocab.strings[match_id]
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add(match_id, [pattern2])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches[0][0] == en_vocab.strings[match_id]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"sentence",
|
||||||
|
[
|
||||||
|
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
||||||
|
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
||||||
|
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
||||||
|
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
||||||
|
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_issue3869(sentence):
|
||||||
|
"""Test that the Doc's count_by function works consistently"""
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(sentence)
|
||||||
|
count = 0
|
||||||
|
for token in doc:
|
||||||
|
count += token.is_alpha
|
||||||
|
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3879(en_vocab):
|
||||||
|
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||||
|
assert len(doc) == 5
|
||||||
|
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3880():
|
||||||
|
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
||||||
|
|
||||||
|
Fixed in v7.0.5 of Thinc.
|
||||||
|
"""
|
||||||
|
texts = ["hello", "world", "", ""]
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
|
nlp.add_pipe(nlp.create_pipe("ner"))
|
||||||
|
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||||
|
nlp.get_pipe("parser").add_label("dep")
|
||||||
|
nlp.get_pipe("ner").add_label("PERSON")
|
||||||
|
nlp.get_pipe("tagger").add_label("NN")
|
||||||
|
nlp.begin_training()
|
||||||
|
for doc in nlp.pipe(texts):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3882(en_vocab):
|
||||||
|
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
||||||
|
copy of the Doc.
|
||||||
|
"""
|
||||||
|
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||||
|
doc.is_parsed = True
|
||||||
|
doc.user_data["test"] = set()
|
||||||
|
parse_deps(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3951(en_vocab):
|
||||||
|
"""Test that combinations of optional rules are matched correctly."""
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [
|
||||||
|
{"LOWER": "hello"},
|
||||||
|
{"LOWER": "this", "OP": "?"},
|
||||||
|
{"OP": "?"},
|
||||||
|
{"LOWER": "world"},
|
||||||
|
]
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3959():
|
||||||
|
""" Ensure that a modified pos attribute is serialized correctly."""
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(
|
||||||
|
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
||||||
|
)
|
||||||
|
assert doc[0].pos_ == ""
|
||||||
|
doc[0].pos_ = "NOUN"
|
||||||
|
assert doc[0].pos_ == "NOUN"
|
||||||
|
# usually this is already True when starting from proper models instead of blank English
|
||||||
|
doc.is_tagged = True
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
file_path = tmp_dir / "my_doc"
|
||||||
|
doc.to_disk(file_path)
|
||||||
|
doc2 = nlp("")
|
||||||
|
doc2.from_disk(file_path)
|
||||||
|
assert doc2[0].pos_ == "NOUN"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3962(en_vocab):
|
||||||
|
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
|
# fmt: off
|
||||||
|
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
||||||
|
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
||||||
|
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
|
span2 = doc[1:5] # "jests at scars ,"
|
||||||
|
doc2 = span2.as_doc()
|
||||||
|
doc2_json = doc2.to_json()
|
||||||
|
assert doc2_json
|
||||||
|
# head set to itself, being the new artificial root
|
||||||
|
assert doc2[0].head.text == "jests"
|
||||||
|
assert doc2[0].dep_ == "dep"
|
||||||
|
assert doc2[1].head.text == "jests"
|
||||||
|
assert doc2[1].dep_ == "prep"
|
||||||
|
assert doc2[2].head.text == "at"
|
||||||
|
assert doc2[2].dep_ == "pobj"
|
||||||
|
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
||||||
|
assert doc2[3].dep_ == "dep"
|
||||||
|
# We should still have 1 sentence
|
||||||
|
assert len(list(doc2.sents)) == 1
|
||||||
|
span3 = doc[6:9] # "never felt a"
|
||||||
|
doc3 = span3.as_doc()
|
||||||
|
doc3_json = doc3.to_json()
|
||||||
|
assert doc3_json
|
||||||
|
assert doc3[0].head.text == "felt"
|
||||||
|
assert doc3[0].dep_ == "neg"
|
||||||
|
assert doc3[1].head.text == "felt"
|
||||||
|
assert doc3[1].dep_ == "ROOT"
|
||||||
|
assert doc3[2].head.text == "felt" # head set to ancestor
|
||||||
|
assert doc3[2].dep_ == "dep"
|
||||||
|
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
||||||
|
assert len(list(doc3.sents)) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3962_long(en_vocab):
|
||||||
|
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
|
# fmt: off
|
||||||
|
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
||||||
|
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
||||||
|
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
|
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
||||||
|
doc2 = span2.as_doc()
|
||||||
|
doc2_json = doc2.to_json()
|
||||||
|
assert doc2_json
|
||||||
|
# head set to itself, being the new artificial root (in sentence 1)
|
||||||
|
assert doc2[0].head.text == "jests"
|
||||||
|
assert doc2[0].dep_ == "ROOT"
|
||||||
|
assert doc2[1].head.text == "jests"
|
||||||
|
assert doc2[1].dep_ == "prep"
|
||||||
|
assert doc2[2].head.text == "at"
|
||||||
|
assert doc2[2].dep_ == "pobj"
|
||||||
|
assert doc2[3].head.text == "jests"
|
||||||
|
assert doc2[3].dep_ == "punct"
|
||||||
|
# head set to itself, being the new artificial root (in sentence 2)
|
||||||
|
assert doc2[4].head.text == "They"
|
||||||
|
assert doc2[4].dep_ == "dep"
|
||||||
|
# head set to the new artificial head (in sentence 2)
|
||||||
|
assert doc2[4].head.text == "They"
|
||||||
|
assert doc2[4].dep_ == "dep"
|
||||||
|
# We should still have 2 sentences
|
||||||
|
sents = list(doc2.sents)
|
||||||
|
assert len(sents) == 2
|
||||||
|
assert sents[0].text == "jests at scars ."
|
||||||
|
assert sents[1].text == "They never"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3972(en_vocab):
|
||||||
|
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||||
|
"""
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||||
|
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||||
|
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
|
||||||
|
assert len(matches) == 2
|
||||||
|
|
||||||
|
# We should have a match for each of the two rules
|
||||||
|
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
||||||
|
assert "A" in found_ids
|
||||||
|
assert "B" in found_ids
|
|
@ -1,8 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
|
||||||
def test_issue3521(en_tokenizer, word):
|
|
||||||
tok = en_tokenizer(word)[1]
|
|
||||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
|
||||||
assert tok.is_stop
|
|
|
@ -1,85 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.tokens import Span
|
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
from spacy import load
|
|
||||||
import srsly
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def patterns():
|
|
||||||
return [
|
|
||||||
{"label": "HELLO", "pattern": "hello world"},
|
|
||||||
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
|
||||||
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
|
||||||
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
|
||||||
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def add_ent():
|
|
||||||
def add_ent_component(doc):
|
|
||||||
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
|
|
||||||
return doc
|
|
||||||
|
|
||||||
return add_ent_component
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
ruler_bytes = ruler.to_bytes()
|
|
||||||
assert len(ruler) == len(patterns)
|
|
||||||
assert len(ruler.labels) == 4
|
|
||||||
assert ruler.overwrite
|
|
||||||
new_ruler = EntityRuler(nlp)
|
|
||||||
new_ruler = new_ruler.from_bytes(ruler_bytes)
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
assert len(new_ruler.labels) == 4
|
|
||||||
assert new_ruler.overwrite == ruler.overwrite
|
|
||||||
assert new_ruler.ent_id_sep == ruler.ent_id_sep
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
|
|
||||||
new_ruler = EntityRuler(nlp)
|
|
||||||
new_ruler = new_ruler.from_bytes(bytes_old_style)
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
for pattern in ruler.patterns:
|
|
||||||
assert pattern in new_ruler.patterns
|
|
||||||
assert new_ruler.overwrite is not ruler.overwrite
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
out_file = tmpdir / "entity_ruler"
|
|
||||||
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
|
|
||||||
new_ruler = EntityRuler(nlp).from_disk(out_file)
|
|
||||||
for pattern in ruler.patterns:
|
|
||||||
assert pattern in new_ruler.patterns
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
assert new_ruler.overwrite is not ruler.overwrite
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, overwrite_ents=True)
|
|
||||||
|
|
||||||
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
nlp.to_disk(tmpdir)
|
|
||||||
ruler = nlp.get_pipe("entity_ruler")
|
|
||||||
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
|
||||||
assert ruler.overwrite is True
|
|
||||||
nlp2 = load(tmpdir)
|
|
||||||
new_ruler = nlp2.get_pipe("entity_ruler")
|
|
||||||
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
|
||||||
assert new_ruler.overwrite is True
|
|
|
@ -1,30 +0,0 @@
|
||||||
from spacy import displacy
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3531():
|
|
||||||
"""Test that displaCy renderer doesn't require "settings" key."""
|
|
||||||
example_dep = {
|
|
||||||
"words": [
|
|
||||||
{"text": "But", "tag": "CCONJ"},
|
|
||||||
{"text": "Google", "tag": "PROPN"},
|
|
||||||
{"text": "is", "tag": "VERB"},
|
|
||||||
{"text": "starting", "tag": "VERB"},
|
|
||||||
{"text": "from", "tag": "ADP"},
|
|
||||||
{"text": "behind.", "tag": "ADV"},
|
|
||||||
],
|
|
||||||
"arcs": [
|
|
||||||
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
|
|
||||||
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
|
|
||||||
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
|
|
||||||
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
|
|
||||||
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
example_ent = {
|
|
||||||
"text": "But Google is starting from behind.",
|
|
||||||
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
|
||||||
}
|
|
||||||
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
|
||||||
assert dep_html
|
|
||||||
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
|
||||||
assert ent_html
|
|
|
@ -1,44 +0,0 @@
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3540(en_vocab):
|
|
||||||
|
|
||||||
words = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
tensor = np.asarray(
|
|
||||||
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
|
||||||
dtype="f",
|
|
||||||
)
|
|
||||||
doc = Doc(en_vocab, words=words)
|
|
||||||
doc.tensor = tensor
|
|
||||||
|
|
||||||
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
assert [token.text for token in doc] == gold_text
|
|
||||||
|
|
||||||
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
assert [token.lemma_ for token in doc] == gold_lemma
|
|
||||||
|
|
||||||
vectors_1 = [token.vector for token in doc]
|
|
||||||
assert len(vectors_1) == len(doc)
|
|
||||||
|
|
||||||
with doc.retokenize() as retokenizer:
|
|
||||||
heads = [(doc[3], 1), doc[2]]
|
|
||||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
|
||||||
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
|
||||||
|
|
||||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
|
||||||
assert [token.text for token in doc] == gold_text
|
|
||||||
|
|
||||||
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
|
||||||
assert [token.lemma_ for token in doc] == gold_lemma
|
|
||||||
|
|
||||||
vectors_2 = [token.vector for token in doc]
|
|
||||||
assert len(vectors_2) == len(doc)
|
|
||||||
|
|
||||||
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
|
||||||
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
|
||||||
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
|
||||||
|
|
||||||
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
|
||||||
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
|
|
@ -1,12 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.errors import MatchPatternError
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3549(en_vocab):
|
|
||||||
"""Test that match pattern validation doesn't raise on empty errors."""
|
|
||||||
matcher = Matcher(en_vocab, validate=True)
|
|
||||||
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
|
||||||
matcher.add("GOOD", [pattern])
|
|
||||||
with pytest.raises(MatchPatternError):
|
|
||||||
matcher.add("BAD", [[{"X": "Y"}]])
|
|
|
@ -1,14 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.tokens import Doc, Token
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_issue3555(en_vocab):
|
|
||||||
"""Test that custom extensions with default None don't break matcher."""
|
|
||||||
Token.set_extension("issue3555", default=None)
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
doc = Doc(en_vocab, words=["have", "apple"])
|
|
||||||
matcher(doc)
|
|
|
@ -1,45 +0,0 @@
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch
|
|
||||||
from thinc.api import compounding
|
|
||||||
from spacy.gold import Example
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3611():
|
|
||||||
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
|
||||||
unique_classes = ["offensive", "inoffensive"]
|
|
||||||
x_train = [
|
|
||||||
"This is an offensive text",
|
|
||||||
"This is the second offensive text",
|
|
||||||
"inoff",
|
|
||||||
]
|
|
||||||
y_train = ["offensive", "offensive", "inoffensive"]
|
|
||||||
|
|
||||||
nlp = spacy.blank("en")
|
|
||||||
|
|
||||||
# preparing the data
|
|
||||||
train_data = []
|
|
||||||
for text, train_instance in zip(x_train, y_train):
|
|
||||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
|
||||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
|
||||||
|
|
||||||
# add a text categorizer component
|
|
||||||
textcat = nlp.create_pipe(
|
|
||||||
"textcat",
|
|
||||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
|
||||||
)
|
|
||||||
|
|
||||||
for label in unique_classes:
|
|
||||||
textcat.add_label(label)
|
|
||||||
nlp.add_pipe(textcat, last=True)
|
|
||||||
|
|
||||||
# training the network
|
|
||||||
with nlp.select_pipes(enable="textcat"):
|
|
||||||
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
|
||||||
for i in range(3):
|
|
||||||
losses = {}
|
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
|
||||||
|
|
||||||
for batch in batches:
|
|
||||||
nlp.update(
|
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
|
@ -1,9 +0,0 @@
|
||||||
from spacy.lang.hi import Hindi
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3625():
|
|
||||||
"""Test that default punctuation rules applies to hindi unicode characters"""
|
|
||||||
nlp = Hindi()
|
|
||||||
doc = nlp("hi. how हुए. होटल, होटल")
|
|
||||||
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
|
||||||
assert [token.text for token in doc] == expected
|
|
|
@ -1,10 +0,0 @@
|
||||||
from spacy.lang.es import Spanish
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3803():
|
|
||||||
"""Test that spanish num-like tokens have True for like_num attribute."""
|
|
||||||
nlp = Spanish()
|
|
||||||
text = "2 dos 1000 mil 12 doce"
|
|
||||||
doc = nlp(text)
|
|
||||||
|
|
||||||
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
|
|
@ -1,34 +0,0 @@
|
||||||
from spacy.pipeline.pipes import DependencyParser
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
from spacy.pipeline.defaults import default_parser
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3830_no_subtok():
|
|
||||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
|
||||||
config = {
|
|
||||||
"learn_tokens": False,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
|
||||||
parser.add_label("nsubj")
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
parser.begin_training(lambda: [])
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3830_with_subtok():
|
|
||||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
|
||||||
config = {
|
|
||||||
"learn_tokens": True,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
|
||||||
parser.add_label("nsubj")
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
parser.begin_training(lambda: [])
|
|
||||||
assert "subtok" in parser.labels
|
|
|
@ -1,18 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3839(en_vocab):
|
|
||||||
"""Test that match IDs returned by the matcher are correct, are in the string """
|
|
||||||
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
match_id = "PATTERN"
|
|
||||||
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
|
||||||
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
|
||||||
matcher.add(match_id, [pattern1])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert matches[0][0] == en_vocab.strings[match_id]
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add(match_id, [pattern2])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert matches[0][0] == en_vocab.strings[match_id]
|
|
|
@ -1,25 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.attrs import IS_ALPHA
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"sentence",
|
|
||||||
[
|
|
||||||
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
|
||||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
|
||||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
|
||||||
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
|
||||||
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_issue3869(sentence):
|
|
||||||
"""Test that the Doc's count_by function works consistently"""
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(sentence)
|
|
||||||
|
|
||||||
count = 0
|
|
||||||
for token in doc:
|
|
||||||
count += token.is_alpha
|
|
||||||
|
|
||||||
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
|
|
@ -1,11 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3879(en_vocab):
|
|
||||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
|
||||||
assert len(doc) == 5
|
|
||||||
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
|
|
@ -1,21 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
|
||||||
def test_issue3880():
|
|
||||||
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
|
||||||
|
|
||||||
Fixed in v7.0.5 of Thinc.
|
|
||||||
"""
|
|
||||||
texts = ["hello", "world", "", ""]
|
|
||||||
nlp = English()
|
|
||||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
|
||||||
nlp.add_pipe(nlp.create_pipe("ner"))
|
|
||||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
|
||||||
nlp.get_pipe("parser").add_label("dep")
|
|
||||||
nlp.get_pipe("ner").add_label("PERSON")
|
|
||||||
nlp.get_pipe("tagger").add_label("NN")
|
|
||||||
nlp.begin_training()
|
|
||||||
for doc in nlp.pipe(texts):
|
|
||||||
pass
|
|
|
@ -1,12 +0,0 @@
|
||||||
from spacy.displacy import parse_deps
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3882(en_vocab):
|
|
||||||
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
|
||||||
copy of the Doc.
|
|
||||||
"""
|
|
||||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
|
||||||
doc.is_parsed = True
|
|
||||||
doc.user_data["test"] = set()
|
|
||||||
parse_deps(doc)
|
|
|
@ -1,17 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3951(en_vocab):
|
|
||||||
"""Test that combinations of optional rules are matched correctly."""
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
pattern = [
|
|
||||||
{"LOWER": "hello"},
|
|
||||||
{"LOWER": "this", "OP": "?"},
|
|
||||||
{"OP": "?"},
|
|
||||||
{"LOWER": "world"},
|
|
||||||
]
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert len(matches) == 0
|
|
|
@ -1,26 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3959():
|
|
||||||
""" Ensure that a modified pos attribute is serialized correctly."""
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(
|
|
||||||
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
|
||||||
)
|
|
||||||
assert doc[0].pos_ == ""
|
|
||||||
|
|
||||||
doc[0].pos_ = "NOUN"
|
|
||||||
assert doc[0].pos_ == "NOUN"
|
|
||||||
|
|
||||||
# usually this is already True when starting from proper models instead of blank English
|
|
||||||
doc.is_tagged = True
|
|
||||||
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
file_path = tmp_dir / "my_doc"
|
|
||||||
doc.to_disk(file_path)
|
|
||||||
|
|
||||||
doc2 = nlp("")
|
|
||||||
doc2.from_disk(file_path)
|
|
||||||
|
|
||||||
assert doc2[0].pos_ == "NOUN"
|
|
|
@ -1,117 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from ..util import get_doc
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def doc(en_tokenizer):
|
|
||||||
text = "He jests at scars, that never felt a wound."
|
|
||||||
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
|
||||||
deps = [
|
|
||||||
"nsubj",
|
|
||||||
"ccomp",
|
|
||||||
"prep",
|
|
||||||
"pobj",
|
|
||||||
"punct",
|
|
||||||
"nsubj",
|
|
||||||
"neg",
|
|
||||||
"ROOT",
|
|
||||||
"det",
|
|
||||||
"dobj",
|
|
||||||
"punct",
|
|
||||||
]
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962(doc):
|
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
|
||||||
span2 = doc[1:5] # "jests at scars ,"
|
|
||||||
doc2 = span2.as_doc()
|
|
||||||
doc2_json = doc2.to_json()
|
|
||||||
assert doc2_json
|
|
||||||
|
|
||||||
assert (
|
|
||||||
doc2[0].head.text == "jests"
|
|
||||||
) # head set to itself, being the new artificial root
|
|
||||||
assert doc2[0].dep_ == "dep"
|
|
||||||
assert doc2[1].head.text == "jests"
|
|
||||||
assert doc2[1].dep_ == "prep"
|
|
||||||
assert doc2[2].head.text == "at"
|
|
||||||
assert doc2[2].dep_ == "pobj"
|
|
||||||
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
|
||||||
assert doc2[3].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 1 sentence
|
|
||||||
assert len(list(doc2.sents)) == 1
|
|
||||||
|
|
||||||
span3 = doc[6:9] # "never felt a"
|
|
||||||
doc3 = span3.as_doc()
|
|
||||||
doc3_json = doc3.to_json()
|
|
||||||
assert doc3_json
|
|
||||||
|
|
||||||
assert doc3[0].head.text == "felt"
|
|
||||||
assert doc3[0].dep_ == "neg"
|
|
||||||
assert doc3[1].head.text == "felt"
|
|
||||||
assert doc3[1].dep_ == "ROOT"
|
|
||||||
assert doc3[2].head.text == "felt" # head set to ancestor
|
|
||||||
assert doc3[2].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
|
||||||
assert len(list(doc3.sents)) == 1
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def two_sent_doc(en_tokenizer):
|
|
||||||
text = "He jests at scars. They never felt a wound."
|
|
||||||
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
|
||||||
deps = [
|
|
||||||
"nsubj",
|
|
||||||
"ROOT",
|
|
||||||
"prep",
|
|
||||||
"pobj",
|
|
||||||
"punct",
|
|
||||||
"nsubj",
|
|
||||||
"neg",
|
|
||||||
"ROOT",
|
|
||||||
"det",
|
|
||||||
"dobj",
|
|
||||||
"punct",
|
|
||||||
]
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962_long(two_sent_doc):
|
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
|
||||||
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
|
||||||
doc2 = span2.as_doc()
|
|
||||||
doc2_json = doc2.to_json()
|
|
||||||
assert doc2_json
|
|
||||||
|
|
||||||
assert (
|
|
||||||
doc2[0].head.text == "jests"
|
|
||||||
) # head set to itself, being the new artificial root (in sentence 1)
|
|
||||||
assert doc2[0].dep_ == "ROOT"
|
|
||||||
assert doc2[1].head.text == "jests"
|
|
||||||
assert doc2[1].dep_ == "prep"
|
|
||||||
assert doc2[2].head.text == "at"
|
|
||||||
assert doc2[2].dep_ == "pobj"
|
|
||||||
assert doc2[3].head.text == "jests"
|
|
||||||
assert doc2[3].dep_ == "punct"
|
|
||||||
assert (
|
|
||||||
doc2[4].head.text == "They"
|
|
||||||
) # head set to itself, being the new artificial root (in sentence 2)
|
|
||||||
assert doc2[4].dep_ == "dep"
|
|
||||||
assert (
|
|
||||||
doc2[4].head.text == "They"
|
|
||||||
) # head set to the new artificial head (in sentence 2)
|
|
||||||
assert doc2[4].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 2 sentences
|
|
||||||
sents = list(doc2.sents)
|
|
||||||
assert len(sents) == 2
|
|
||||||
assert sents[0].text == "jests at scars ."
|
|
||||||
assert sents[1].text == "They never"
|
|
|
@ -1,19 +0,0 @@
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3972(en_vocab):
|
|
||||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab)
|
|
||||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
|
||||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
|
||||||
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
|
||||||
matches = matcher(doc)
|
|
||||||
|
|
||||||
assert len(matches) == 2
|
|
||||||
|
|
||||||
# We should have a match for each of the two rules
|
|
||||||
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
|
||||||
assert "A" in found_ids
|
|
||||||
assert "B" in found_ids
|
|
469
spacy/tests/regression/test_issue4001-4500.py
Normal file
469
spacy/tests/regression/test_issue4001-4500.py
Normal file
|
@ -0,0 +1,469 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
|
||||||
|
from spacy.pipeline.defaults import default_ner
|
||||||
|
from spacy.matcher import PhraseMatcher, Matcher
|
||||||
|
from spacy.tokens import Doc, Span, DocBin
|
||||||
|
from spacy.gold import Example, Corpus
|
||||||
|
from spacy.gold.converters import json2docs
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.util import minibatch, ensure_path, load_model
|
||||||
|
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
from spacy.lang.el import Greek
|
||||||
|
from spacy.language import Language
|
||||||
|
import spacy
|
||||||
|
from thinc.api import compounding
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4002(en_vocab):
|
||||||
|
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
||||||
|
"""
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
|
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||||
|
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||||
|
matcher.add("TEST", [pattern1])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
||||||
|
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
|
pattern2 = Doc(en_vocab, words=["1", "2"])
|
||||||
|
pattern2[0].norm_ = "c"
|
||||||
|
pattern2[1].norm_ = "d"
|
||||||
|
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
||||||
|
matcher.add("TEST", [pattern2])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4030():
|
||||||
|
""" Test whether textcat works fine with empty doc """
|
||||||
|
unique_classes = ["offensive", "inoffensive"]
|
||||||
|
x_train = [
|
||||||
|
"This is an offensive text",
|
||||||
|
"This is the second offensive text",
|
||||||
|
"inoff",
|
||||||
|
]
|
||||||
|
y_train = ["offensive", "offensive", "inoffensive"]
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
# preparing the data
|
||||||
|
train_data = []
|
||||||
|
for text, train_instance in zip(x_train, y_train):
|
||||||
|
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||||
|
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||||
|
# add a text categorizer component
|
||||||
|
textcat = nlp.create_pipe(
|
||||||
|
"textcat",
|
||||||
|
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||||
|
)
|
||||||
|
for label in unique_classes:
|
||||||
|
textcat.add_label(label)
|
||||||
|
nlp.add_pipe(textcat, last=True)
|
||||||
|
# training the network
|
||||||
|
with nlp.select_pipes(enable="textcat"):
|
||||||
|
optimizer = nlp.begin_training()
|
||||||
|
for i in range(3):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(
|
||||||
|
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||||
|
)
|
||||||
|
# processing of an empty doc should result in 0.0 for all categories
|
||||||
|
doc = nlp("")
|
||||||
|
assert doc.cats["offensive"] == 0.0
|
||||||
|
assert doc.cats["inoffensive"] == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4042():
|
||||||
|
"""Test that serialization of an EntityRuler before NER works fine."""
|
||||||
|
nlp = English()
|
||||||
|
|
||||||
|
# add ner pipe
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
ner.add_label("SOME_LABEL")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
# Add entity ruler
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [
|
||||||
|
{"label": "MY_ORG", "pattern": "Apple"},
|
||||||
|
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
|
||||||
|
]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler, before="ner") # works fine with "after"
|
||||||
|
doc1 = nlp("What do you think about Apple ?")
|
||||||
|
assert doc1.ents[0].label_ == "MY_ORG"
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
output_dir = ensure_path(d)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
nlp.to_disk(output_dir)
|
||||||
|
|
||||||
|
nlp2 = load_model(output_dir)
|
||||||
|
doc2 = nlp2("What do you think about Apple ?")
|
||||||
|
assert doc2.ents[0].label_ == "MY_ORG"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4042_bug2():
|
||||||
|
"""
|
||||||
|
Test that serialization of an NER works fine when new labels were added.
|
||||||
|
This is the second bug of two bugs underlying the issue 4042.
|
||||||
|
"""
|
||||||
|
nlp1 = English()
|
||||||
|
vocab = nlp1.vocab
|
||||||
|
|
||||||
|
# add ner pipe
|
||||||
|
ner1 = nlp1.create_pipe("ner")
|
||||||
|
ner1.add_label("SOME_LABEL")
|
||||||
|
nlp1.add_pipe(ner1)
|
||||||
|
nlp1.begin_training()
|
||||||
|
|
||||||
|
# add a new label to the doc
|
||||||
|
doc1 = nlp1("What do you think about Apple ?")
|
||||||
|
assert len(ner1.labels) == 1
|
||||||
|
assert "SOME_LABEL" in ner1.labels
|
||||||
|
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
|
||||||
|
doc1.ents = list(doc1.ents) + [apple_ent]
|
||||||
|
|
||||||
|
# reapply the NER - at this point it should resize itself
|
||||||
|
ner1(doc1)
|
||||||
|
assert len(ner1.labels) == 2
|
||||||
|
assert "SOME_LABEL" in ner1.labels
|
||||||
|
assert "MY_ORG" in ner1.labels
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
# assert IO goes fine
|
||||||
|
output_dir = ensure_path(d)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
ner1.to_disk(output_dir)
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
||||||
|
ner2.from_disk(output_dir)
|
||||||
|
assert len(ner2.labels) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4054(en_vocab):
|
||||||
|
"""Test that a new blank model can be made with a vocab from file,
|
||||||
|
and that serialization does not drop the language at any point."""
|
||||||
|
nlp1 = English()
|
||||||
|
vocab1 = nlp1.vocab
|
||||||
|
with make_tempdir() as d:
|
||||||
|
vocab_dir = ensure_path(d / "vocab")
|
||||||
|
if not vocab_dir.exists():
|
||||||
|
vocab_dir.mkdir()
|
||||||
|
vocab1.to_disk(vocab_dir)
|
||||||
|
vocab2 = Vocab().from_disk(vocab_dir)
|
||||||
|
print("lang", vocab2.lang)
|
||||||
|
nlp2 = spacy.blank("en", vocab=vocab2)
|
||||||
|
nlp_dir = ensure_path(d / "nlp")
|
||||||
|
if not nlp_dir.exists():
|
||||||
|
nlp_dir.mkdir()
|
||||||
|
nlp2.to_disk(nlp_dir)
|
||||||
|
nlp3 = load_model(nlp_dir)
|
||||||
|
assert nlp3.lang == "en"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4120(en_vocab):
|
||||||
|
"""Test that matches without a final {OP: ?} token are returned."""
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
|
||||||
|
doc1 = Doc(en_vocab, words=["a"])
|
||||||
|
assert len(matcher(doc1)) == 1 # works
|
||||||
|
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
assert len(matcher(doc2)) == 2 # fixed
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
|
||||||
|
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||||
|
assert len(matcher(doc3)) == 2 # works
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
|
||||||
|
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||||
|
assert len(matcher(doc4)) == 3 # fixed
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4133(en_vocab):
|
||||||
|
nlp = English()
|
||||||
|
vocab_bytes = nlp.vocab.to_bytes()
|
||||||
|
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
|
||||||
|
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
for i, token in enumerate(doc):
|
||||||
|
token.pos_ = pos[i]
|
||||||
|
# usually this is already True when starting from proper models instead of blank English
|
||||||
|
doc.is_tagged = True
|
||||||
|
doc_bytes = doc.to_bytes()
|
||||||
|
vocab = Vocab()
|
||||||
|
vocab = vocab.from_bytes(vocab_bytes)
|
||||||
|
doc = Doc(vocab).from_bytes(doc_bytes)
|
||||||
|
actual = []
|
||||||
|
for token in doc:
|
||||||
|
actual.append(token.pos_)
|
||||||
|
assert actual == pos
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4190():
|
||||||
|
def customize_tokenizer(nlp):
|
||||||
|
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
|
||||||
|
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
|
||||||
|
infix_re = compile_infix_regex(nlp.Defaults.infixes)
|
||||||
|
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
||||||
|
exceptions = {
|
||||||
|
k: v
|
||||||
|
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
||||||
|
if not (len(k) == 2 and k[1] == ".")
|
||||||
|
}
|
||||||
|
new_tokenizer = Tokenizer(
|
||||||
|
nlp.vocab,
|
||||||
|
exceptions,
|
||||||
|
prefix_search=prefix_re.search,
|
||||||
|
suffix_search=suffix_re.search,
|
||||||
|
infix_finditer=infix_re.finditer,
|
||||||
|
token_match=nlp.tokenizer.token_match,
|
||||||
|
)
|
||||||
|
nlp.tokenizer = new_tokenizer
|
||||||
|
|
||||||
|
test_string = "Test c."
|
||||||
|
# Load default language
|
||||||
|
nlp_1 = English()
|
||||||
|
doc_1a = nlp_1(test_string)
|
||||||
|
result_1a = [token.text for token in doc_1a] # noqa: F841
|
||||||
|
# Modify tokenizer
|
||||||
|
customize_tokenizer(nlp_1)
|
||||||
|
doc_1b = nlp_1(test_string)
|
||||||
|
result_1b = [token.text for token in doc_1b]
|
||||||
|
# Save and Reload
|
||||||
|
with make_tempdir() as model_dir:
|
||||||
|
nlp_1.to_disk(model_dir)
|
||||||
|
nlp_2 = load_model(model_dir)
|
||||||
|
# This should be the modified tokenizer
|
||||||
|
doc_2 = nlp_2(test_string)
|
||||||
|
result_2 = [token.text for token in doc_2]
|
||||||
|
assert result_1b == result_2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4267():
|
||||||
|
""" Test that running an entity_ruler after ner gives consistent results"""
|
||||||
|
nlp = English()
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
ner.add_label("PEOPLE")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
assert "ner" in nlp.pipe_names
|
||||||
|
# assert that we have correct IOB annotations
|
||||||
|
doc1 = nlp("hi")
|
||||||
|
assert doc1.is_nered
|
||||||
|
for token in doc1:
|
||||||
|
assert token.ent_iob == 2
|
||||||
|
# add entity ruler and run again
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
assert "entity_ruler" in nlp.pipe_names
|
||||||
|
assert "ner" in nlp.pipe_names
|
||||||
|
# assert that we still have correct IOB annotations
|
||||||
|
doc2 = nlp("hi")
|
||||||
|
assert doc2.is_nered
|
||||||
|
for token in doc2:
|
||||||
|
assert token.ent_iob == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4272():
|
||||||
|
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
||||||
|
are available."""
|
||||||
|
nlp = Greek()
|
||||||
|
doc = nlp("Χθες")
|
||||||
|
assert doc[0].lemma_
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_predictions():
|
||||||
|
class DummyPipe(Pipe):
|
||||||
|
def __init__(self):
|
||||||
|
self.model = "dummy_model"
|
||||||
|
|
||||||
|
def predict(self, docs):
|
||||||
|
return ([1, 2, 3], [4, 5, 6])
|
||||||
|
|
||||||
|
def set_annotations(self, docs, scores, tensors=None):
|
||||||
|
return docs
|
||||||
|
|
||||||
|
nlp = Language()
|
||||||
|
doc = nlp.make_doc("foo")
|
||||||
|
dummy_pipe = DummyPipe()
|
||||||
|
dummy_pipe(doc)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
|
||||||
|
def test_issue4313():
|
||||||
|
""" This should not crash or exit with some strange error code """
|
||||||
|
beam_width = 16
|
||||||
|
beam_density = 0.0001
|
||||||
|
nlp = English()
|
||||||
|
config = {
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
|
||||||
|
ner.add_label("SOME_LABEL")
|
||||||
|
ner.begin_training([])
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
|
||||||
|
# add a new label to the doc
|
||||||
|
doc = nlp("What do you think about Apple ?")
|
||||||
|
assert len(ner.labels) == 1
|
||||||
|
assert "SOME_LABEL" in ner.labels
|
||||||
|
apple_ent = Span(doc, 5, 6, label="MY_ORG")
|
||||||
|
doc.ents = list(doc.ents) + [apple_ent]
|
||||||
|
|
||||||
|
# ensure the beam_parse still works with the new label
|
||||||
|
docs = [doc]
|
||||||
|
beams = nlp.entity.beam_parse(
|
||||||
|
docs, beam_width=beam_width, beam_density=beam_density
|
||||||
|
)
|
||||||
|
|
||||||
|
for doc, beam in zip(docs, beams):
|
||||||
|
entity_scores = defaultdict(float)
|
||||||
|
for score, ents in nlp.entity.moves.get_beam_parses(beam):
|
||||||
|
for start, end, label in ents:
|
||||||
|
entity_scores[(start, end, label)] += score
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4348():
|
||||||
|
"""Test that training the tagger with empty data, doesn't throw errors"""
|
||||||
|
nlp = English()
|
||||||
|
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
|
||||||
|
TRAIN_DATA = [example, example]
|
||||||
|
tagger = nlp.create_pipe("tagger")
|
||||||
|
nlp.add_pipe(tagger)
|
||||||
|
optimizer = nlp.begin_training()
|
||||||
|
for i in range(5):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4367():
|
||||||
|
"""Test that docbin init goes well"""
|
||||||
|
DocBin()
|
||||||
|
DocBin(attrs=["LEMMA"])
|
||||||
|
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4373():
|
||||||
|
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
||||||
|
matcher = Matcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
matcher = PhraseMatcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4402():
|
||||||
|
json_data = {
|
||||||
|
"id": 0,
|
||||||
|
"paragraphs": [
|
||||||
|
{
|
||||||
|
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
||||||
|
"sentences": [
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 0, "orth": "How", "ner": "O"},
|
||||||
|
{"id": 1, "orth": "should", "ner": "O"},
|
||||||
|
{"id": 2, "orth": "I", "ner": "O"},
|
||||||
|
{"id": 3, "orth": "cook", "ner": "O"},
|
||||||
|
{"id": 4, "orth": "bacon", "ner": "O"},
|
||||||
|
{"id": 5, "orth": "in", "ner": "O"},
|
||||||
|
{"id": 6, "orth": "an", "ner": "O"},
|
||||||
|
{"id": 7, "orth": "oven", "ner": "O"},
|
||||||
|
{"id": 8, "orth": "?", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 9, "orth": "\n", "ner": "O"},
|
||||||
|
{"id": 10, "orth": "I", "ner": "O"},
|
||||||
|
{"id": 11, "orth": "'ve", "ner": "O"},
|
||||||
|
{"id": 12, "orth": "heard", "ner": "O"},
|
||||||
|
{"id": 13, "orth": "of", "ner": "O"},
|
||||||
|
{"id": 14, "orth": "people", "ner": "O"},
|
||||||
|
{"id": 15, "orth": "cooking", "ner": "O"},
|
||||||
|
{"id": 16, "orth": "bacon", "ner": "O"},
|
||||||
|
{"id": 17, "orth": "in", "ner": "O"},
|
||||||
|
{"id": 18, "orth": "an", "ner": "O"},
|
||||||
|
{"id": 19, "orth": "oven", "ner": "O"},
|
||||||
|
{"id": 20, "orth": ".", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"cats": [
|
||||||
|
{"label": "baking", "value": 1.0},
|
||||||
|
{"label": "not_baking", "value": 0.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"raw": "What is the difference between white and brown eggs?\n",
|
||||||
|
"sentences": [
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 0, "orth": "What", "ner": "O"},
|
||||||
|
{"id": 1, "orth": "is", "ner": "O"},
|
||||||
|
{"id": 2, "orth": "the", "ner": "O"},
|
||||||
|
{"id": 3, "orth": "difference", "ner": "O"},
|
||||||
|
{"id": 4, "orth": "between", "ner": "O"},
|
||||||
|
{"id": 5, "orth": "white", "ner": "O"},
|
||||||
|
{"id": 6, "orth": "and", "ner": "O"},
|
||||||
|
{"id": 7, "orth": "brown", "ner": "O"},
|
||||||
|
{"id": 8, "orth": "eggs", "ner": "O"},
|
||||||
|
{"id": 9, "orth": "?", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
||||||
|
],
|
||||||
|
"cats": [
|
||||||
|
{"label": "baking", "value": 0.0},
|
||||||
|
{"label": "not_baking", "value": 1.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
nlp = English()
|
||||||
|
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
output_file = tmpdir / "test4402.spacy"
|
||||||
|
docs = json2docs([json_data])
|
||||||
|
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
||||||
|
with output_file.open("wb") as file_:
|
||||||
|
file_.write(data)
|
||||||
|
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||||
|
|
||||||
|
train_data = list(corpus.train_dataset(nlp))
|
||||||
|
assert len(train_data) == 2
|
||||||
|
|
||||||
|
split_train_data = []
|
||||||
|
for eg in train_data:
|
||||||
|
split_train_data.extend(eg.split_sents())
|
||||||
|
assert len(split_train_data) == 4
|
|
@ -1,23 +0,0 @@
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4002(en_vocab):
|
|
||||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
|
||||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
|
||||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
|
||||||
matcher.add("TEST", [pattern1])
|
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
|
||||||
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert len(matches) == 1
|
|
||||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
|
||||||
pattern2 = Doc(en_vocab, words=["1", "2"])
|
|
||||||
pattern2[0].norm_ = "c"
|
|
||||||
pattern2[1].norm_ = "d"
|
|
||||||
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
|
||||||
matcher.add("TEST", [pattern2])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert len(matches) == 1
|
|
|
@ -1,50 +0,0 @@
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch
|
|
||||||
from thinc.api import compounding
|
|
||||||
from spacy.gold import Example
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4030():
|
|
||||||
""" Test whether textcat works fine with empty doc """
|
|
||||||
unique_classes = ["offensive", "inoffensive"]
|
|
||||||
x_train = [
|
|
||||||
"This is an offensive text",
|
|
||||||
"This is the second offensive text",
|
|
||||||
"inoff",
|
|
||||||
]
|
|
||||||
y_train = ["offensive", "offensive", "inoffensive"]
|
|
||||||
|
|
||||||
nlp = spacy.blank("en")
|
|
||||||
|
|
||||||
# preparing the data
|
|
||||||
train_data = []
|
|
||||||
for text, train_instance in zip(x_train, y_train):
|
|
||||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
|
||||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
|
||||||
|
|
||||||
# add a text categorizer component
|
|
||||||
textcat = nlp.create_pipe(
|
|
||||||
"textcat",
|
|
||||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
|
||||||
)
|
|
||||||
|
|
||||||
for label in unique_classes:
|
|
||||||
textcat.add_label(label)
|
|
||||||
nlp.add_pipe(textcat, last=True)
|
|
||||||
|
|
||||||
# training the network
|
|
||||||
with nlp.select_pipes(enable="textcat"):
|
|
||||||
optimizer = nlp.begin_training()
|
|
||||||
for i in range(3):
|
|
||||||
losses = {}
|
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
|
||||||
|
|
||||||
for batch in batches:
|
|
||||||
nlp.update(
|
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
||||||
|
|
||||||
# processing of an empty doc should result in 0.0 for all categories
|
|
||||||
doc = nlp("")
|
|
||||||
assert doc.cats["offensive"] == 0.0
|
|
||||||
assert doc.cats["inoffensive"] == 0.0
|
|
|
@ -1,85 +0,0 @@
|
||||||
import spacy
|
|
||||||
from spacy.pipeline import EntityRecognizer, EntityRuler
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Span
|
|
||||||
from spacy.util import ensure_path
|
|
||||||
from spacy.pipeline.defaults import default_ner
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4042():
|
|
||||||
"""Test that serialization of an EntityRuler before NER works fine."""
|
|
||||||
nlp = English()
|
|
||||||
|
|
||||||
# add ner pipe
|
|
||||||
ner = nlp.create_pipe("ner")
|
|
||||||
ner.add_label("SOME_LABEL")
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
nlp.begin_training()
|
|
||||||
|
|
||||||
# Add entity ruler
|
|
||||||
ruler = EntityRuler(nlp)
|
|
||||||
patterns = [
|
|
||||||
{"label": "MY_ORG", "pattern": "Apple"},
|
|
||||||
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
|
|
||||||
]
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
nlp.add_pipe(ruler, before="ner") # works fine with "after"
|
|
||||||
doc1 = nlp("What do you think about Apple ?")
|
|
||||||
assert doc1.ents[0].label_ == "MY_ORG"
|
|
||||||
|
|
||||||
with make_tempdir() as d:
|
|
||||||
output_dir = ensure_path(d)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
nlp.to_disk(output_dir)
|
|
||||||
|
|
||||||
nlp2 = spacy.load(output_dir)
|
|
||||||
doc2 = nlp2("What do you think about Apple ?")
|
|
||||||
assert doc2.ents[0].label_ == "MY_ORG"
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4042_bug2():
|
|
||||||
"""
|
|
||||||
Test that serialization of an NER works fine when new labels were added.
|
|
||||||
This is the second bug of two bugs underlying the issue 4042.
|
|
||||||
"""
|
|
||||||
nlp1 = English()
|
|
||||||
vocab = nlp1.vocab
|
|
||||||
|
|
||||||
# add ner pipe
|
|
||||||
ner1 = nlp1.create_pipe("ner")
|
|
||||||
ner1.add_label("SOME_LABEL")
|
|
||||||
nlp1.add_pipe(ner1)
|
|
||||||
nlp1.begin_training()
|
|
||||||
|
|
||||||
# add a new label to the doc
|
|
||||||
doc1 = nlp1("What do you think about Apple ?")
|
|
||||||
assert len(ner1.labels) == 1
|
|
||||||
assert "SOME_LABEL" in ner1.labels
|
|
||||||
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
|
|
||||||
doc1.ents = list(doc1.ents) + [apple_ent]
|
|
||||||
|
|
||||||
# reapply the NER - at this point it should resize itself
|
|
||||||
ner1(doc1)
|
|
||||||
assert len(ner1.labels) == 2
|
|
||||||
assert "SOME_LABEL" in ner1.labels
|
|
||||||
assert "MY_ORG" in ner1.labels
|
|
||||||
|
|
||||||
with make_tempdir() as d:
|
|
||||||
# assert IO goes fine
|
|
||||||
output_dir = ensure_path(d)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
ner1.to_disk(output_dir)
|
|
||||||
|
|
||||||
config = {
|
|
||||||
"learn_tokens": False,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
|
||||||
ner2.from_disk(output_dir)
|
|
||||||
assert len(ner2.labels) == 2
|
|
|
@ -1,30 +0,0 @@
|
||||||
from spacy.vocab import Vocab
|
|
||||||
import spacy
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.util import ensure_path
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4054(en_vocab):
|
|
||||||
"""Test that a new blank model can be made with a vocab from file,
|
|
||||||
and that serialization does not drop the language at any point."""
|
|
||||||
nlp1 = English()
|
|
||||||
vocab1 = nlp1.vocab
|
|
||||||
|
|
||||||
with make_tempdir() as d:
|
|
||||||
vocab_dir = ensure_path(d / "vocab")
|
|
||||||
if not vocab_dir.exists():
|
|
||||||
vocab_dir.mkdir()
|
|
||||||
vocab1.to_disk(vocab_dir)
|
|
||||||
|
|
||||||
vocab2 = Vocab().from_disk(vocab_dir)
|
|
||||||
print("lang", vocab2.lang)
|
|
||||||
nlp2 = spacy.blank("en", vocab=vocab2)
|
|
||||||
|
|
||||||
nlp_dir = ensure_path(d / "nlp")
|
|
||||||
if not nlp_dir.exists():
|
|
||||||
nlp_dir.mkdir()
|
|
||||||
nlp2.to_disk(nlp_dir)
|
|
||||||
nlp3 = spacy.load(nlp_dir)
|
|
||||||
assert nlp3.lang == "en"
|
|
|
@ -1,23 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4120(en_vocab):
|
|
||||||
"""Test that matches without a final {OP: ?} token are returned."""
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
|
|
||||||
doc1 = Doc(en_vocab, words=["a"])
|
|
||||||
assert len(matcher(doc1)) == 1 # works
|
|
||||||
|
|
||||||
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
|
||||||
assert len(matcher(doc2)) == 2 # fixed
|
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
|
|
||||||
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
|
||||||
assert len(matcher(doc3)) == 2 # works
|
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
|
|
||||||
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
|
||||||
assert len(matcher(doc4)) == 3 # fixed
|
|
|
@ -1,28 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4133(en_vocab):
|
|
||||||
nlp = English()
|
|
||||||
vocab_bytes = nlp.vocab.to_bytes()
|
|
||||||
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
|
|
||||||
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
|
|
||||||
doc = Doc(en_vocab, words=words)
|
|
||||||
for i, token in enumerate(doc):
|
|
||||||
token.pos_ = pos[i]
|
|
||||||
|
|
||||||
# usually this is already True when starting from proper models instead of blank English
|
|
||||||
doc.is_tagged = True
|
|
||||||
|
|
||||||
doc_bytes = doc.to_bytes()
|
|
||||||
|
|
||||||
vocab = Vocab()
|
|
||||||
vocab = vocab.from_bytes(vocab_bytes)
|
|
||||||
doc = Doc(vocab).from_bytes(doc_bytes)
|
|
||||||
|
|
||||||
actual = []
|
|
||||||
for token in doc:
|
|
||||||
actual.append(token.pos_)
|
|
||||||
|
|
||||||
assert actual == pos
|
|
|
@ -1,46 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokenizer import Tokenizer
|
|
||||||
from spacy import util
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4190():
|
|
||||||
test_string = "Test c."
|
|
||||||
# Load default language
|
|
||||||
nlp_1 = English()
|
|
||||||
doc_1a = nlp_1(test_string)
|
|
||||||
result_1a = [token.text for token in doc_1a] # noqa: F841
|
|
||||||
# Modify tokenizer
|
|
||||||
customize_tokenizer(nlp_1)
|
|
||||||
doc_1b = nlp_1(test_string)
|
|
||||||
result_1b = [token.text for token in doc_1b]
|
|
||||||
# Save and Reload
|
|
||||||
with make_tempdir() as model_dir:
|
|
||||||
nlp_1.to_disk(model_dir)
|
|
||||||
nlp_2 = util.load_model(model_dir)
|
|
||||||
# This should be the modified tokenizer
|
|
||||||
doc_2 = nlp_2(test_string)
|
|
||||||
result_2 = [token.text for token in doc_2]
|
|
||||||
assert result_1b == result_2
|
|
||||||
|
|
||||||
|
|
||||||
def customize_tokenizer(nlp):
|
|
||||||
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
|
|
||||||
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
|
|
||||||
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
|
|
||||||
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
|
||||||
exceptions = {
|
|
||||||
k: v
|
|
||||||
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
|
||||||
if not (len(k) == 2 and k[1] == ".")
|
|
||||||
}
|
|
||||||
new_tokenizer = Tokenizer(
|
|
||||||
nlp.vocab,
|
|
||||||
exceptions,
|
|
||||||
prefix_search=prefix_re.search,
|
|
||||||
suffix_search=suffix_re.search,
|
|
||||||
infix_finditer=infix_re.finditer,
|
|
||||||
token_match=nlp.tokenizer.token_match,
|
|
||||||
)
|
|
||||||
nlp.tokenizer = new_tokenizer
|
|
|
@ -1,34 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4267():
|
|
||||||
""" Test that running an entity_ruler after ner gives consistent results"""
|
|
||||||
nlp = English()
|
|
||||||
ner = nlp.create_pipe("ner")
|
|
||||||
ner.add_label("PEOPLE")
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
nlp.begin_training()
|
|
||||||
|
|
||||||
assert "ner" in nlp.pipe_names
|
|
||||||
|
|
||||||
# assert that we have correct IOB annotations
|
|
||||||
doc1 = nlp("hi")
|
|
||||||
assert doc1.is_nered
|
|
||||||
for token in doc1:
|
|
||||||
assert token.ent_iob == 2
|
|
||||||
|
|
||||||
# add entity ruler and run again
|
|
||||||
ruler = EntityRuler(nlp)
|
|
||||||
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
|
|
||||||
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
assert "entity_ruler" in nlp.pipe_names
|
|
||||||
assert "ner" in nlp.pipe_names
|
|
||||||
|
|
||||||
# assert that we still have correct IOB annotations
|
|
||||||
doc2 = nlp("hi")
|
|
||||||
assert doc2.is_nered
|
|
||||||
for token in doc2:
|
|
||||||
assert token.ent_iob == 2
|
|
|
@ -1,9 +0,0 @@
|
||||||
from spacy.lang.el import Greek
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4272():
|
|
||||||
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
|
||||||
are available."""
|
|
||||||
nlp = Greek()
|
|
||||||
doc = nlp("Χθες")
|
|
||||||
assert doc[0].lemma_
|
|
|
@ -1,25 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.pipeline import Pipe
|
|
||||||
|
|
||||||
|
|
||||||
class DummyPipe(Pipe):
|
|
||||||
def __init__(self):
|
|
||||||
self.model = "dummy_model"
|
|
||||||
|
|
||||||
def predict(self, docs):
|
|
||||||
return ([1, 2, 3], [4, 5, 6])
|
|
||||||
|
|
||||||
def set_annotations(self, docs, scores, tensors=None):
|
|
||||||
return docs
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def nlp():
|
|
||||||
return Language()
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_predictions(nlp):
|
|
||||||
doc = nlp.make_doc("foo")
|
|
||||||
dummy_pipe = DummyPipe()
|
|
||||||
dummy_pipe(doc)
|
|
|
@ -1,47 +0,0 @@
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from spacy.pipeline.defaults import default_ner
|
|
||||||
from spacy.pipeline import EntityRecognizer
|
|
||||||
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Span
|
|
||||||
|
|
||||||
|
|
||||||
# skipped after removing Beam stuff during the Example/GoldParse refactor
|
|
||||||
@pytest.mark.skip
|
|
||||||
def test_issue4313():
|
|
||||||
""" This should not crash or exit with some strange error code """
|
|
||||||
beam_width = 16
|
|
||||||
beam_density = 0.0001
|
|
||||||
nlp = English()
|
|
||||||
config = {
|
|
||||||
"learn_tokens": False,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
|
|
||||||
ner.add_label("SOME_LABEL")
|
|
||||||
ner.begin_training([])
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
|
|
||||||
# add a new label to the doc
|
|
||||||
doc = nlp("What do you think about Apple ?")
|
|
||||||
assert len(ner.labels) == 1
|
|
||||||
assert "SOME_LABEL" in ner.labels
|
|
||||||
apple_ent = Span(doc, 5, 6, label="MY_ORG")
|
|
||||||
doc.ents = list(doc.ents) + [apple_ent]
|
|
||||||
|
|
||||||
# ensure the beam_parse still works with the new label
|
|
||||||
docs = [doc]
|
|
||||||
beams = nlp.entity.beam_parse(
|
|
||||||
docs, beam_width=beam_width, beam_density=beam_density
|
|
||||||
)
|
|
||||||
|
|
||||||
for doc, beam in zip(docs, beams):
|
|
||||||
entity_scores = defaultdict(float)
|
|
||||||
for score, ents in nlp.entity.moves.get_beam_parses(beam):
|
|
||||||
for start, end, label in ents:
|
|
||||||
entity_scores[(start, end, label)] += score
|
|
|
@ -1,24 +0,0 @@
|
||||||
from spacy.gold import Example
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.util import minibatch
|
|
||||||
from thinc.api import compounding
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
|
||||||
def test_issue4348():
|
|
||||||
"""Test that training the tagger with empty data, doesn't throw errors"""
|
|
||||||
|
|
||||||
nlp = English()
|
|
||||||
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
|
|
||||||
TRAIN_DATA = [example, example]
|
|
||||||
|
|
||||||
tagger = nlp.create_pipe("tagger")
|
|
||||||
nlp.add_pipe(tagger)
|
|
||||||
|
|
||||||
optimizer = nlp.begin_training()
|
|
||||||
for i in range(5):
|
|
||||||
losses = {}
|
|
||||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
|
||||||
for batch in batches:
|
|
||||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
|
|
@ -1,8 +0,0 @@
|
||||||
from spacy.tokens import DocBin
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4367():
|
|
||||||
"""Test that docbin init goes well"""
|
|
||||||
DocBin()
|
|
||||||
DocBin(attrs=["LEMMA"])
|
|
||||||
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
|
|
|
@ -1,10 +0,0 @@
|
||||||
from spacy.matcher import Matcher, PhraseMatcher
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4373():
|
|
||||||
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
|
||||||
matcher = Matcher(Vocab())
|
|
||||||
assert isinstance(matcher.vocab, Vocab)
|
|
||||||
matcher = PhraseMatcher(Vocab())
|
|
||||||
assert isinstance(matcher.vocab, Vocab)
|
|
|
@ -1,98 +0,0 @@
|
||||||
from spacy.gold import Corpus
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
from ...gold.converters import json2docs
|
|
||||||
from ...tokens import DocBin
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4402():
|
|
||||||
nlp = English()
|
|
||||||
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
output_file = tmpdir / "test4402.spacy"
|
|
||||||
docs = json2docs([json_data])
|
|
||||||
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
|
||||||
with output_file.open("wb") as file_:
|
|
||||||
file_.write(data)
|
|
||||||
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
|
||||||
|
|
||||||
train_data = list(corpus.train_dataset(nlp))
|
|
||||||
assert len(train_data) == 2
|
|
||||||
|
|
||||||
split_train_data = []
|
|
||||||
for eg in train_data:
|
|
||||||
split_train_data.extend(eg.split_sents())
|
|
||||||
assert len(split_train_data) == 4
|
|
||||||
|
|
||||||
|
|
||||||
json_data = {
|
|
||||||
"id": 0,
|
|
||||||
"paragraphs": [
|
|
||||||
{
|
|
||||||
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
|
||||||
"sentences": [
|
|
||||||
{
|
|
||||||
"tokens": [
|
|
||||||
{"id": 0, "orth": "How", "ner": "O"},
|
|
||||||
{"id": 1, "orth": "should", "ner": "O"},
|
|
||||||
{"id": 2, "orth": "I", "ner": "O"},
|
|
||||||
{"id": 3, "orth": "cook", "ner": "O"},
|
|
||||||
{"id": 4, "orth": "bacon", "ner": "O"},
|
|
||||||
{"id": 5, "orth": "in", "ner": "O"},
|
|
||||||
{"id": 6, "orth": "an", "ner": "O"},
|
|
||||||
{"id": 7, "orth": "oven", "ner": "O"},
|
|
||||||
{"id": 8, "orth": "?", "ner": "O"},
|
|
||||||
],
|
|
||||||
"brackets": [],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tokens": [
|
|
||||||
{"id": 9, "orth": "\n", "ner": "O"},
|
|
||||||
{"id": 10, "orth": "I", "ner": "O"},
|
|
||||||
{"id": 11, "orth": "'ve", "ner": "O"},
|
|
||||||
{"id": 12, "orth": "heard", "ner": "O"},
|
|
||||||
{"id": 13, "orth": "of", "ner": "O"},
|
|
||||||
{"id": 14, "orth": "people", "ner": "O"},
|
|
||||||
{"id": 15, "orth": "cooking", "ner": "O"},
|
|
||||||
{"id": 16, "orth": "bacon", "ner": "O"},
|
|
||||||
{"id": 17, "orth": "in", "ner": "O"},
|
|
||||||
{"id": 18, "orth": "an", "ner": "O"},
|
|
||||||
{"id": 19, "orth": "oven", "ner": "O"},
|
|
||||||
{"id": 20, "orth": ".", "ner": "O"},
|
|
||||||
],
|
|
||||||
"brackets": [],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
"cats": [
|
|
||||||
{"label": "baking", "value": 1.0},
|
|
||||||
{"label": "not_baking", "value": 0.0},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"raw": "What is the difference between white and brown eggs?\n",
|
|
||||||
"sentences": [
|
|
||||||
{
|
|
||||||
"tokens": [
|
|
||||||
{"id": 0, "orth": "What", "ner": "O"},
|
|
||||||
{"id": 1, "orth": "is", "ner": "O"},
|
|
||||||
{"id": 2, "orth": "the", "ner": "O"},
|
|
||||||
{"id": 3, "orth": "difference", "ner": "O"},
|
|
||||||
{"id": 4, "orth": "between", "ner": "O"},
|
|
||||||
{"id": 5, "orth": "white", "ner": "O"},
|
|
||||||
{"id": 6, "orth": "and", "ner": "O"},
|
|
||||||
{"id": 7, "orth": "brown", "ner": "O"},
|
|
||||||
{"id": 8, "orth": "eggs", "ner": "O"},
|
|
||||||
{"id": 9, "orth": "?", "ner": "O"},
|
|
||||||
],
|
|
||||||
"brackets": [],
|
|
||||||
},
|
|
||||||
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
|
||||||
],
|
|
||||||
"cats": [
|
|
||||||
{"label": "baking", "value": 0.0},
|
|
||||||
{"label": "not_baking", "value": 1.0},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
}
|
|
288
spacy/tests/regression/test_issue4501-5000.py
Normal file
288
spacy/tests/regression/test_issue4501-5000.py
Normal file
|
@ -0,0 +1,288 @@
|
||||||
|
import pytest
|
||||||
|
from mock import Mock
|
||||||
|
from spacy.pipeline import EntityRuler
|
||||||
|
from spacy.matcher import DependencyMatcher
|
||||||
|
from spacy.tokens import Doc, Span, DocBin
|
||||||
|
from spacy.gold import Example
|
||||||
|
from spacy.gold.converters.conllu2docs import conllu2docs
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.kb import KnowledgeBase
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.util import ensure_path, load_model_from_path
|
||||||
|
import numpy
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
from ..util import get_doc, make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4528(en_vocab):
|
||||||
|
"""Test that user_data is correctly serialized in DocBin."""
|
||||||
|
doc = Doc(en_vocab, words=["hello", "world"])
|
||||||
|
doc.user_data["foo"] = "bar"
|
||||||
|
# This is how extension attribute values are stored in the user data
|
||||||
|
doc.user_data[("._.", "foo", None, None)] = "bar"
|
||||||
|
doc_bin = DocBin(store_user_data=True)
|
||||||
|
doc_bin.add(doc)
|
||||||
|
doc_bin_bytes = doc_bin.to_bytes()
|
||||||
|
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
|
||||||
|
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
|
||||||
|
assert new_doc.user_data["foo"] == "bar"
|
||||||
|
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
|
||||||
|
)
|
||||||
|
def test_gold_misaligned(en_tokenizer, text, words):
|
||||||
|
doc = en_tokenizer(text)
|
||||||
|
Example.from_dict(doc, {"words": words})
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4590(en_vocab):
|
||||||
|
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
||||||
|
pattern = [
|
||||||
|
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||||
|
"PATTERN": {"ORTH": "fox"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
||||||
|
"PATTERN": {"ORTH": "fox"},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
on_match = Mock()
|
||||||
|
matcher = DependencyMatcher(en_vocab)
|
||||||
|
matcher.add("pattern", on_match, pattern)
|
||||||
|
text = "The quick brown fox jumped over the lazy fox"
|
||||||
|
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||||
|
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
|
||||||
|
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
||||||
|
matches = matcher(doc)
|
||||||
|
on_match_args = on_match.call_args
|
||||||
|
assert on_match_args[0][3] == matches
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4651_with_phrase_matcher_attr():
|
||||||
|
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||||
|
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||||
|
specified.
|
||||||
|
"""
|
||||||
|
text = "Spacy is a python library for nlp"
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
|
||||||
|
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
doc = nlp(text)
|
||||||
|
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||||
|
nlp_reloaded = English()
|
||||||
|
with make_tempdir() as d:
|
||||||
|
file_path = d / "entityruler"
|
||||||
|
ruler.to_disk(file_path)
|
||||||
|
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||||
|
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||||
|
doc_reloaded = nlp_reloaded(text)
|
||||||
|
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||||
|
assert res == res_reloaded
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4651_without_phrase_matcher_attr():
|
||||||
|
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||||
|
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||||
|
not specified.
|
||||||
|
"""
|
||||||
|
text = "Spacy is a python library for nlp"
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
doc = nlp(text)
|
||||||
|
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||||
|
nlp_reloaded = English()
|
||||||
|
with make_tempdir() as d:
|
||||||
|
file_path = d / "entityruler"
|
||||||
|
ruler.to_disk(file_path)
|
||||||
|
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||||
|
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||||
|
doc_reloaded = nlp_reloaded(text)
|
||||||
|
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||||
|
assert res == res_reloaded
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4665():
|
||||||
|
"""
|
||||||
|
conllu2json should not raise an exception if the HEAD column contains an
|
||||||
|
underscore
|
||||||
|
"""
|
||||||
|
input_data = """
|
||||||
|
1 [ _ PUNCT -LRB- _ _ punct _ _
|
||||||
|
2 This _ DET DT _ _ det _ _
|
||||||
|
3 killing _ NOUN NN _ _ nsubj _ _
|
||||||
|
4 of _ ADP IN _ _ case _ _
|
||||||
|
5 a _ DET DT _ _ det _ _
|
||||||
|
6 respected _ ADJ JJ _ _ amod _ _
|
||||||
|
7 cleric _ NOUN NN _ _ nmod _ _
|
||||||
|
8 will _ AUX MD _ _ aux _ _
|
||||||
|
9 be _ AUX VB _ _ aux _ _
|
||||||
|
10 causing _ VERB VBG _ _ root _ _
|
||||||
|
11 us _ PRON PRP _ _ iobj _ _
|
||||||
|
12 trouble _ NOUN NN _ _ dobj _ _
|
||||||
|
13 for _ ADP IN _ _ case _ _
|
||||||
|
14 years _ NOUN NNS _ _ nmod _ _
|
||||||
|
15 to _ PART TO _ _ mark _ _
|
||||||
|
16 come _ VERB VB _ _ acl _ _
|
||||||
|
17 . _ PUNCT . _ _ punct _ _
|
||||||
|
18 ] _ PUNCT -RRB- _ _ punct _ _
|
||||||
|
"""
|
||||||
|
conllu2docs(input_data)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4674():
|
||||||
|
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
||||||
|
nlp = English()
|
||||||
|
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||||
|
vector1 = [0.9, 1.1, 1.01]
|
||||||
|
vector2 = [1.8, 2.25, 2.01]
|
||||||
|
with pytest.warns(UserWarning):
|
||||||
|
kb.set_entities(
|
||||||
|
entity_list=["Q1", "Q1"],
|
||||||
|
freq_list=[32, 111],
|
||||||
|
vector_list=[vector1, vector2],
|
||||||
|
)
|
||||||
|
assert kb.get_size_entities() == 1
|
||||||
|
# dumping to file & loading back in
|
||||||
|
with make_tempdir() as d:
|
||||||
|
dir_path = ensure_path(d)
|
||||||
|
if not dir_path.exists():
|
||||||
|
dir_path.mkdir()
|
||||||
|
file_path = dir_path / "kb"
|
||||||
|
kb.dump(str(file_path))
|
||||||
|
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
|
||||||
|
kb2.load_bulk(str(file_path))
|
||||||
|
assert kb2.get_size_entities() == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4707():
|
||||||
|
"""Tests that disabled component names are also excluded from nlp.from_disk
|
||||||
|
by default when loading a model.
|
||||||
|
"""
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||||
|
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
|
||||||
|
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
|
||||||
|
exclude = ["tokenizer", "sentencizer"]
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
nlp.to_disk(tmpdir, exclude=exclude)
|
||||||
|
new_nlp = load_model_from_path(tmpdir, disable=exclude)
|
||||||
|
assert "sentencizer" not in new_nlp.pipe_names
|
||||||
|
assert "entity_ruler" in new_nlp.pipe_names
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4725_1():
|
||||||
|
""" Ensure the pickling of the NER goes well"""
|
||||||
|
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||||
|
nlp = English(vocab=vocab)
|
||||||
|
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
|
||||||
|
with make_tempdir() as tmp_path:
|
||||||
|
with (tmp_path / "ner.pkl").open("wb") as file_:
|
||||||
|
pickle.dump(ner, file_)
|
||||||
|
assert ner.cfg["min_action_freq"] == 342
|
||||||
|
|
||||||
|
with (tmp_path / "ner.pkl").open("rb") as file_:
|
||||||
|
ner2 = pickle.load(file_)
|
||||||
|
assert ner2.cfg["min_action_freq"] == 342
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4725_2():
|
||||||
|
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
||||||
|
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
|
||||||
|
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||||
|
data = numpy.ndarray((5, 3), dtype="f")
|
||||||
|
data[0] = 1.0
|
||||||
|
data[1] = 2.0
|
||||||
|
vocab.set_vector("cat", data[0])
|
||||||
|
vocab.set_vector("dog", data[1])
|
||||||
|
nlp = English(vocab=vocab)
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
docs = ["Kurt is in London."] * 10
|
||||||
|
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4849():
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(
|
||||||
|
nlp,
|
||||||
|
patterns=[
|
||||||
|
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
|
||||||
|
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
|
||||||
|
],
|
||||||
|
phrase_matcher_attr="LOWER",
|
||||||
|
)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
text = """
|
||||||
|
The left is starting to take aim at Democratic front-runner Joe Biden.
|
||||||
|
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
|
||||||
|
"""
|
||||||
|
# USING 1 PROCESS
|
||||||
|
count_ents = 0
|
||||||
|
for doc in nlp.pipe([text], n_process=1):
|
||||||
|
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||||
|
assert count_ents == 2
|
||||||
|
# USING 2 PROCESSES
|
||||||
|
count_ents = 0
|
||||||
|
for doc in nlp.pipe([text], n_process=2):
|
||||||
|
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||||
|
assert count_ents == 2
|
||||||
|
|
||||||
|
|
||||||
|
class CustomPipe:
|
||||||
|
name = "my_pipe"
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
Span.set_extension("my_ext", getter=self._get_my_ext)
|
||||||
|
Doc.set_extension("my_ext", default=None)
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
gathered_ext = []
|
||||||
|
for sent in doc.sents:
|
||||||
|
sent_ext = self._get_my_ext(sent)
|
||||||
|
sent._.set("my_ext", sent_ext)
|
||||||
|
gathered_ext.append(sent_ext)
|
||||||
|
|
||||||
|
doc._.set("my_ext", "\n".join(gathered_ext))
|
||||||
|
|
||||||
|
return doc
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_my_ext(span):
|
||||||
|
return str(span.end)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4903():
|
||||||
|
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
|
||||||
|
macOS."""
|
||||||
|
nlp = English()
|
||||||
|
custom_component = CustomPipe()
|
||||||
|
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||||
|
nlp.add_pipe(custom_component, after="sentencizer")
|
||||||
|
|
||||||
|
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
|
||||||
|
docs = list(nlp.pipe(text, n_process=2))
|
||||||
|
assert docs[0].text == "I like bananas."
|
||||||
|
assert docs[1].text == "Do you like them?"
|
||||||
|
assert docs[2].text == "No, I prefer wasabi."
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4924():
|
||||||
|
nlp = Language()
|
||||||
|
example = Example.from_dict(nlp.make_doc(""), {})
|
||||||
|
nlp.evaluate([example])
|
|
@ -1,16 +0,0 @@
|
||||||
from spacy.tokens import Doc, DocBin
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4528(en_vocab):
|
|
||||||
"""Test that user_data is correctly serialized in DocBin."""
|
|
||||||
doc = Doc(en_vocab, words=["hello", "world"])
|
|
||||||
doc.user_data["foo"] = "bar"
|
|
||||||
# This is how extension attribute values are stored in the user data
|
|
||||||
doc.user_data[("._.", "foo", None, None)] = "bar"
|
|
||||||
doc_bin = DocBin(store_user_data=True)
|
|
||||||
doc_bin.add(doc)
|
|
||||||
doc_bin_bytes = doc_bin.to_bytes()
|
|
||||||
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
|
|
||||||
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
|
|
||||||
assert new_doc.user_data["foo"] == "bar"
|
|
||||||
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
|
|
|
@ -1,11 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from spacy.gold import Example
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
|
|
||||||
)
|
|
||||||
def test_gold_misaligned(en_tokenizer, text, words):
|
|
||||||
doc = en_tokenizer(text)
|
|
||||||
Example.from_dict(doc, {"words": words})
|
|
|
@ -1,35 +0,0 @@
|
||||||
from mock import Mock
|
|
||||||
from spacy.matcher import DependencyMatcher
|
|
||||||
from ..util import get_doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4590(en_vocab):
|
|
||||||
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
|
||||||
pattern = [
|
|
||||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
|
||||||
{
|
|
||||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
|
||||||
"PATTERN": {"ORTH": "fox"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
|
||||||
"PATTERN": {"ORTH": "fox"},
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
on_match = Mock()
|
|
||||||
|
|
||||||
matcher = DependencyMatcher(en_vocab)
|
|
||||||
matcher.add("pattern", on_match, pattern)
|
|
||||||
|
|
||||||
text = "The quick brown fox jumped over the lazy fox"
|
|
||||||
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
|
||||||
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
|
|
||||||
|
|
||||||
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
|
||||||
|
|
||||||
matches = matcher(doc)
|
|
||||||
|
|
||||||
on_match_args = on_match.call_args
|
|
||||||
|
|
||||||
assert on_match_args[0][3] == matches
|
|
|
@ -1,62 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4651_with_phrase_matcher_attr():
|
|
||||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
|
||||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
|
||||||
specified.
|
|
||||||
"""
|
|
||||||
text = "Spacy is a python library for nlp"
|
|
||||||
|
|
||||||
nlp = English()
|
|
||||||
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
|
|
||||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
doc = nlp(text)
|
|
||||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
|
||||||
|
|
||||||
nlp_reloaded = English()
|
|
||||||
with make_tempdir() as d:
|
|
||||||
file_path = d / "entityruler"
|
|
||||||
ruler.to_disk(file_path)
|
|
||||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
|
||||||
|
|
||||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
|
||||||
doc_reloaded = nlp_reloaded(text)
|
|
||||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
|
||||||
|
|
||||||
assert res == res_reloaded
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4651_without_phrase_matcher_attr():
|
|
||||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
|
||||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
|
||||||
not specified.
|
|
||||||
"""
|
|
||||||
text = "Spacy is a python library for nlp"
|
|
||||||
|
|
||||||
nlp = English()
|
|
||||||
ruler = EntityRuler(nlp)
|
|
||||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
doc = nlp(text)
|
|
||||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
|
||||||
|
|
||||||
nlp_reloaded = English()
|
|
||||||
with make_tempdir() as d:
|
|
||||||
file_path = d / "entityruler"
|
|
||||||
ruler.to_disk(file_path)
|
|
||||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
|
||||||
|
|
||||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
|
||||||
doc_reloaded = nlp_reloaded(text)
|
|
||||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
|
||||||
|
|
||||||
assert res == res_reloaded
|
|
|
@ -1,35 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
# TODO
|
|
||||||
# from spacy.gold.converters.conllu2docs import conllu2docs
|
|
||||||
|
|
||||||
input_data = """
|
|
||||||
1 [ _ PUNCT -LRB- _ _ punct _ _
|
|
||||||
2 This _ DET DT _ _ det _ _
|
|
||||||
3 killing _ NOUN NN _ _ nsubj _ _
|
|
||||||
4 of _ ADP IN _ _ case _ _
|
|
||||||
5 a _ DET DT _ _ det _ _
|
|
||||||
6 respected _ ADJ JJ _ _ amod _ _
|
|
||||||
7 cleric _ NOUN NN _ _ nmod _ _
|
|
||||||
8 will _ AUX MD _ _ aux _ _
|
|
||||||
9 be _ AUX VB _ _ aux _ _
|
|
||||||
10 causing _ VERB VBG _ _ root _ _
|
|
||||||
11 us _ PRON PRP _ _ iobj _ _
|
|
||||||
12 trouble _ NOUN NN _ _ dobj _ _
|
|
||||||
13 for _ ADP IN _ _ case _ _
|
|
||||||
14 years _ NOUN NNS _ _ nmod _ _
|
|
||||||
15 to _ PART TO _ _ mark _ _
|
|
||||||
16 come _ VERB VB _ _ acl _ _
|
|
||||||
17 . _ PUNCT . _ _ punct _ _
|
|
||||||
18 ] _ PUNCT -RRB- _ _ punct _ _
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_issue4665():
|
|
||||||
"""
|
|
||||||
conllu2json should not raise an exception if the HEAD column contains an
|
|
||||||
underscore
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
# conllu2json(input_data)
|
|
|
@ -1,36 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.kb import KnowledgeBase
|
|
||||||
from spacy.util import ensure_path
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4674():
|
|
||||||
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
|
||||||
nlp = English()
|
|
||||||
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
|
||||||
|
|
||||||
vector1 = [0.9, 1.1, 1.01]
|
|
||||||
vector2 = [1.8, 2.25, 2.01]
|
|
||||||
with pytest.warns(UserWarning):
|
|
||||||
kb.set_entities(
|
|
||||||
entity_list=["Q1", "Q1"],
|
|
||||||
freq_list=[32, 111],
|
|
||||||
vector_list=[vector1, vector2],
|
|
||||||
)
|
|
||||||
|
|
||||||
assert kb.get_size_entities() == 1
|
|
||||||
|
|
||||||
# dumping to file & loading back in
|
|
||||||
with make_tempdir() as d:
|
|
||||||
dir_path = ensure_path(d)
|
|
||||||
if not dir_path.exists():
|
|
||||||
dir_path.mkdir()
|
|
||||||
file_path = dir_path / "kb"
|
|
||||||
kb.dump(str(file_path))
|
|
||||||
|
|
||||||
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
|
|
||||||
kb2.load_bulk(str(file_path))
|
|
||||||
|
|
||||||
assert kb2.get_size_entities() == 1
|
|
|
@ -1,20 +0,0 @@
|
||||||
from spacy.util import load_model_from_path
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4707():
|
|
||||||
"""Tests that disabled component names are also excluded from nlp.from_disk
|
|
||||||
by default when loading a model.
|
|
||||||
"""
|
|
||||||
nlp = English()
|
|
||||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
|
||||||
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
|
|
||||||
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
|
|
||||||
exclude = ["tokenizer", "sentencizer"]
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
nlp.to_disk(tmpdir, exclude=exclude)
|
|
||||||
new_nlp = load_model_from_path(tmpdir, disable=exclude)
|
|
||||||
assert "sentencizer" not in new_nlp.pipe_names
|
|
||||||
assert "entity_ruler" in new_nlp.pipe_names
|
|
|
@ -1,41 +0,0 @@
|
||||||
import pickle
|
|
||||||
import numpy
|
|
||||||
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
from spacy.tests.util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_pickle_ner():
|
|
||||||
""" Ensure the pickling of the NER goes well"""
|
|
||||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
|
||||||
nlp = English(vocab=vocab)
|
|
||||||
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
|
|
||||||
with make_tempdir() as tmp_path:
|
|
||||||
with (tmp_path / "ner.pkl").open("wb") as file_:
|
|
||||||
pickle.dump(ner, file_)
|
|
||||||
assert ner.cfg["min_action_freq"] == 342
|
|
||||||
|
|
||||||
with (tmp_path / "ner.pkl").open("rb") as file_:
|
|
||||||
ner2 = pickle.load(file_)
|
|
||||||
assert ner2.cfg["min_action_freq"] == 342
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4725():
|
|
||||||
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
|
||||||
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
|
|
||||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
|
||||||
data = numpy.ndarray((5, 3), dtype="f")
|
|
||||||
data[0] = 1.0
|
|
||||||
data[1] = 2.0
|
|
||||||
vocab.set_vector("cat", data[0])
|
|
||||||
vocab.set_vector("dog", data[1])
|
|
||||||
|
|
||||||
nlp = English(vocab=vocab)
|
|
||||||
ner = nlp.create_pipe("ner")
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
nlp.begin_training()
|
|
||||||
docs = ["Kurt is in London."] * 10
|
|
||||||
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
|
||||||
pass
|
|
|
@ -1,34 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4849():
|
|
||||||
nlp = English()
|
|
||||||
|
|
||||||
ruler = EntityRuler(
|
|
||||||
nlp,
|
|
||||||
patterns=[
|
|
||||||
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
|
|
||||||
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
|
|
||||||
],
|
|
||||||
phrase_matcher_attr="LOWER",
|
|
||||||
)
|
|
||||||
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
text = """
|
|
||||||
The left is starting to take aim at Democratic front-runner Joe Biden.
|
|
||||||
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
|
|
||||||
"""
|
|
||||||
|
|
||||||
# USING 1 PROCESS
|
|
||||||
count_ents = 0
|
|
||||||
for doc in nlp.pipe([text], n_process=1):
|
|
||||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
|
||||||
assert count_ents == 2
|
|
||||||
|
|
||||||
# USING 2 PROCESSES
|
|
||||||
count_ents = 0
|
|
||||||
for doc in nlp.pipe([text], n_process=2):
|
|
||||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
|
||||||
assert count_ents == 2
|
|
|
@ -1,40 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Span, Doc
|
|
||||||
|
|
||||||
|
|
||||||
class CustomPipe:
|
|
||||||
name = "my_pipe"
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
Span.set_extension("my_ext", getter=self._get_my_ext)
|
|
||||||
Doc.set_extension("my_ext", default=None)
|
|
||||||
|
|
||||||
def __call__(self, doc):
|
|
||||||
gathered_ext = []
|
|
||||||
for sent in doc.sents:
|
|
||||||
sent_ext = self._get_my_ext(sent)
|
|
||||||
sent._.set("my_ext", sent_ext)
|
|
||||||
gathered_ext.append(sent_ext)
|
|
||||||
|
|
||||||
doc._.set("my_ext", "\n".join(gathered_ext))
|
|
||||||
|
|
||||||
return doc
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _get_my_ext(span):
|
|
||||||
return str(span.end)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4903():
|
|
||||||
# ensures that this runs correctly and doesn't hang or crash on Windows / macOS
|
|
||||||
|
|
||||||
nlp = English()
|
|
||||||
custom_component = CustomPipe()
|
|
||||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
|
||||||
nlp.add_pipe(custom_component, after="sentencizer")
|
|
||||||
|
|
||||||
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
|
|
||||||
docs = list(nlp.pipe(text, n_process=2))
|
|
||||||
assert docs[0].text == "I like bananas."
|
|
||||||
assert docs[1].text == "Do you like them?"
|
|
||||||
assert docs[2].text == "No, I prefer wasabi."
|
|
|
@ -1,8 +0,0 @@
|
||||||
from spacy.gold import Example
|
|
||||||
from spacy.language import Language
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4924():
|
|
||||||
nlp = Language()
|
|
||||||
example = Example.from_dict(nlp.make_doc(""), {})
|
|
||||||
nlp.evaluate([example])
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
import pytest
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue5152():
|
def test_issue5152():
|
||||||
# Test that the comparison between a Span and a Token, goes well
|
# Test that the comparison between a Span and a Token, goes well
|
||||||
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
|
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
|
||||||
|
@ -8,7 +10,6 @@ def test_issue5152():
|
||||||
text = nlp("Talk about being boring!")
|
text = nlp("Talk about being boring!")
|
||||||
text_var = nlp("Talk of being boring!")
|
text_var = nlp("Talk of being boring!")
|
||||||
y = nlp("Let")
|
y = nlp("Let")
|
||||||
|
|
||||||
span = text[0:3] # Talk about being
|
span = text[0:3] # Talk about being
|
||||||
span_2 = text[0:3] # Talk about being
|
span_2 = text[0:3] # Talk about being
|
||||||
span_3 = text_var[0:3] # Talk of being
|
span_3 = text_var[0:3] # Talk of being
|
||||||
|
|
|
@ -63,7 +63,8 @@ def tagger():
|
||||||
# need to add model for two reasons:
|
# need to add model for two reasons:
|
||||||
# 1. no model leads to error in serialization,
|
# 1. no model leads to error in serialization,
|
||||||
# 2. the affected line is the one for model serialization
|
# 2. the affected line is the one for model serialization
|
||||||
tagger.begin_training(pipeline=nlp.pipeline)
|
with pytest.warns(UserWarning):
|
||||||
|
tagger.begin_training(pipeline=nlp.pipeline)
|
||||||
return tagger
|
return tagger
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
from spacy.errors import AlignmentError
|
from spacy.errors import AlignmentError
|
||||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||||
from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
|
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
|
||||||
from spacy.gold import Corpus, docs_to_json
|
from spacy.gold import Corpus, docs_to_json
|
||||||
from spacy.gold.example import Example
|
from spacy.gold.example import Example
|
||||||
from spacy.gold.converters import json2docs
|
from spacy.gold.converters import json2docs
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
from spacy.pipeline import EntityRuler
|
||||||
from spacy.tokens import Doc, DocBin
|
from spacy.tokens import Doc, DocBin
|
||||||
from spacy.util import get_words_and_spaces, minibatch
|
from spacy.util import get_words_and_spaces, minibatch
|
||||||
from thinc.api import compounding
|
from thinc.api import compounding
|
||||||
|
@ -271,75 +272,76 @@ def test_split_sentences(en_vocab):
|
||||||
assert split_examples[1].text == "had loads of fun "
|
assert split_examples[1].text == "had loads of fun "
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
|
|
||||||
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
|
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
|
||||||
words = ["I", "flew to", "San Francisco Valley", "."]
|
words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
spaces = [True, True, False, False]
|
spaces = [True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
|
gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "U-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("I "), len("I flew to"), "ORG"),
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "U-ORG", "U-LOC", "O"]
|
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("I "), len("I flew"), "ORG"),
|
(len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person
|
||||||
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", None, "U-LOC", "O"]
|
assert ner_tags == ["O", None, "O", "U-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
|
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
|
||||||
words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
|
spaces = [True, True, True, True, True, True, True, False, False]
|
||||||
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
|
gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
ner_tags = example.get_aligned_ner()
|
||||||
|
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
entities = [
|
||||||
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
ner_tags = example.get_aligned_ner()
|
||||||
|
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
|
||||||
|
words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
|
||||||
spaces = [True, True, True, True, True, False, False]
|
spaces = [True, True, True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
gold_words = ["I", "flew to", "San Francisco Valley", "."]
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
|
gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("I "), len("I flew to"), "ORG"),
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["I", "flew to", "San Francisco Valley", "."]
|
gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"]
|
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
|
|
||||||
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
|
|
||||||
words = ["I flew", "to", "San Francisco", "Valley", "."]
|
|
||||||
spaces = [True, True, True, False, False]
|
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
|
||||||
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
|
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
|
||||||
ner_tags = example.get_aligned_ner()
|
|
||||||
assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
|
|
||||||
|
|
||||||
entities = [
|
|
||||||
(len("I "), len("I flew to"), "ORG"),
|
|
||||||
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
|
||||||
]
|
|
||||||
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
|
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
|
||||||
ner_tags = example.get_aligned_ner()
|
|
||||||
assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
|
def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
|
||||||
|
@ -349,7 +351,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
|
||||||
"I flew to San Francisco Valley.",
|
"I flew to San Francisco Valley.",
|
||||||
)
|
)
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
prefix = "I flew to "
|
||||||
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
|
gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
|
||||||
gold_spaces = [True, True, False, True, False, False]
|
gold_spaces = [True, True, False, True, False, False]
|
||||||
example = Example.from_dict(
|
example = Example.from_dict(
|
||||||
|
@ -405,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
|
||||||
assert spans[1].label_ == "GPE"
|
assert spans[1].label_ == "GPE"
|
||||||
|
|
||||||
|
|
||||||
|
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
|
||||||
|
words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
|
||||||
|
spaces = [True, True, True, False, False]
|
||||||
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
|
entities = [
|
||||||
|
(0, len("Mr and Mrs Smith"), "PERSON"),
|
||||||
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
|
||||||
|
ents_ref = example.reference.ents
|
||||||
|
assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
|
||||||
|
ents_y2x = example.get_aligned_spans_y2x(ents_ref)
|
||||||
|
assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
|
||||||
|
text = "Mr and Mrs Smith flew to San Francisco Valley"
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
|
||||||
|
{"label": "LOC", "pattern": "San Francisco Valley"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
doc = nlp(text)
|
||||||
|
assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
|
||||||
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
|
entities = [
|
||||||
|
(0, len("Mr and Mrs Smith"), "PERSON"),
|
||||||
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
|
||||||
|
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
|
||||||
|
assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
|
||||||
|
|
||||||
|
# Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
|
||||||
|
ents_pred = example.predicted.ents
|
||||||
|
assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
|
||||||
|
ents_x2y = example.get_aligned_spans_x2y(ents_pred)
|
||||||
|
assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
|
||||||
|
|
||||||
|
|
||||||
def test_gold_ner_missing_tags(en_tokenizer):
|
def test_gold_ner_missing_tags(en_tokenizer):
|
||||||
doc = en_tokenizer("I flew to Silicon Valley via London.")
|
doc = en_tokenizer("I flew to Silicon Valley via London.")
|
||||||
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||||
|
@ -412,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
|
||||||
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
|
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_projectivize(en_tokenizer):
|
||||||
|
doc = en_tokenizer("He pretty quickly walks away")
|
||||||
|
heads = [3, 2, 3, 0, 2]
|
||||||
|
example = Example.from_dict(doc, {"heads": heads})
|
||||||
|
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
|
||||||
|
nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
|
||||||
|
assert proj_heads == [3, 2, 3, 0, 3]
|
||||||
|
assert nonproj_heads == [3, 2, 3, 0, 2]
|
||||||
|
|
||||||
|
|
||||||
def test_iob_to_biluo():
|
def test_iob_to_biluo():
|
||||||
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
||||||
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
|
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
|
||||||
|
@ -514,6 +570,7 @@ def test_make_orth_variants(doc):
|
||||||
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip("Outdated")
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"tokens_a,tokens_b,expected",
|
"tokens_a,tokens_b,expected",
|
||||||
[
|
[
|
||||||
|
@ -537,12 +594,12 @@ def test_make_orth_variants(doc):
|
||||||
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
|
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_align(tokens_a, tokens_b, expected):
|
def test_align(tokens_a, tokens_b, expected): # noqa
|
||||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
|
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa
|
||||||
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected
|
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa
|
||||||
# check symmetry
|
# check symmetry
|
||||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
|
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa
|
||||||
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
|
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa
|
||||||
|
|
||||||
|
|
||||||
def test_goldparse_startswith_space(en_tokenizer):
|
def test_goldparse_startswith_space(en_tokenizer):
|
||||||
|
@ -556,7 +613,7 @@ def test_goldparse_startswith_space(en_tokenizer):
|
||||||
doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
|
doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
|
||||||
)
|
)
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == [None, "U-DATE"]
|
assert ner_tags == ["O", "U-DATE"]
|
||||||
assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]
|
assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,7 @@ def test_aligned_tags():
|
||||||
predicted = Doc(vocab, words=pred_words)
|
predicted = Doc(vocab, words=pred_words)
|
||||||
example = Example.from_dict(predicted, annots)
|
example = Example.from_dict(predicted, annots)
|
||||||
aligned_tags = example.get_aligned("tag", as_string=True)
|
aligned_tags = example.get_aligned("tag", as_string=True)
|
||||||
assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"]
|
assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
|
||||||
|
|
||||||
|
|
||||||
def test_aligned_tags_multi():
|
def test_aligned_tags_multi():
|
||||||
|
|
31
spacy/tests/test_projects.py
Normal file
31
spacy/tests/test_projects.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.cli.project.util import validate_project_commands
|
||||||
|
from spacy.schemas import ProjectConfigSchema, validate
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"config",
|
||||||
|
[
|
||||||
|
{"commands": [{"name": "a"}, {"name": "a"}]},
|
||||||
|
{"commands": [{"name": "a"}], "workflows": {"a": []}},
|
||||||
|
{"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_project_config_validation1(config):
|
||||||
|
with pytest.raises(SystemExit):
|
||||||
|
validate_project_commands(config)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"config,n_errors",
|
||||||
|
[
|
||||||
|
({"commands": {"a": []}}, 1),
|
||||||
|
({"commands": [{"help": "..."}]}, 1),
|
||||||
|
({"commands": [{"name": "a", "extra": "b"}]}, 1),
|
||||||
|
({"commands": [{"extra": "b"}]}, 2),
|
||||||
|
({"commands": [{"name": "a", "deps": [123]}]}, 1),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_project_config_validation2(config, n_errors):
|
||||||
|
errors = validate(ProjectConfigSchema, config)
|
||||||
|
assert len(errors) == n_errors
|
|
@ -449,6 +449,16 @@ def split_command(command: str) -> List[str]:
|
||||||
return shlex.split(command, posix=not is_windows)
|
return shlex.split(command, posix=not is_windows)
|
||||||
|
|
||||||
|
|
||||||
|
def join_command(command: List[str]) -> str:
|
||||||
|
"""Join a command using shlex. shlex.join is only available for Python 3.8+,
|
||||||
|
so we're using a workaround here.
|
||||||
|
|
||||||
|
command (List[str]): The command to join.
|
||||||
|
RETURNS (str): The joined command
|
||||||
|
"""
|
||||||
|
return " ".join(shlex.quote(cmd) for cmd in command)
|
||||||
|
|
||||||
|
|
||||||
def run_command(command: Union[str, List[str]]) -> None:
|
def run_command(command: Union[str, List[str]]) -> None:
|
||||||
"""Run a command on the command line as a subprocess. If the subprocess
|
"""Run a command on the command line as a subprocess. If the subprocess
|
||||||
returns a non-zero exit code, a system exit is performed.
|
returns a non-zero exit code, a system exit is performed.
|
||||||
|
@ -520,6 +530,15 @@ def get_checksum(path: Union[Path, str]) -> str:
|
||||||
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def is_cwd(path: Union[Path, str]) -> bool:
|
||||||
|
"""Check whether a path is the current working directory.
|
||||||
|
|
||||||
|
path (Union[Path, str]): The directory path.
|
||||||
|
RETURNS (bool): Whether the path is the current working directory.
|
||||||
|
"""
|
||||||
|
return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
|
||||||
|
|
||||||
|
|
||||||
def is_in_jupyter():
|
def is_in_jupyter():
|
||||||
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
||||||
IPython kernel. Mainly used for the displaCy visualizer.
|
IPython kernel. Mainly used for the displaCy visualizer.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user