diff --git a/examples/experiments/onto-ner.cfg b/examples/experiments/onto-ner.cfg index 48fe25a67..228289128 100644 --- a/examples/experiments/onto-ner.cfg +++ b/examples/experiments/onto-ner.cfg @@ -9,27 +9,28 @@ max_length = 5000 limit = 0 # Data augmentation orth_variant_level = 0.0 -dropout = 0.2 +dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. -patience = 1600 +patience = 100000 max_epochs = 0 -max_steps = 20000 -eval_frequency = 500 +max_steps = 0 +eval_frequency = 1000 # Other settings seed = 0 -accumulate_gradient = 1 +accumulate_gradient = 2 use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. scores = ["speed", "ents_p", "ents_r", "ents_f"] score_weights = {"ents_f": 1.0} # These settings are invalid for the transformer models. init_tok2vec = null -discard_oversize = false +discard_oversize = true omit_extra_lookups = false +batch_by_words = true [training.batch_size] @schedules = "compounding.v1" -start = 100 +start = 1000 stop = 1000 compound = 1.001 @@ -37,18 +38,18 @@ compound = 1.001 @optimizers = "Adam.v1" beta1 = 0.9 beta2 = 0.999 -L2_is_weight_decay = false -L2 = 1e-6 +L2_is_weight_decay = true +L2 = 0.01 grad_clip = 1.0 use_averages = true eps = 1e-8 learn_rate = 0.001 -#[optimizer.learn_rate] +#[training.optimizer.learn_rate] #@schedules = "warmup_linear.v1" -#warmup_steps = 250 -#total_steps = 20000 -#initial_rate = 0.001 +#warmup_steps = 1000 +#total_steps = 50000 +#initial_rate = 0.003 [nlp] lang = "en" @@ -58,8 +59,6 @@ vectors = null factory = "ner" learn_tokens = false min_action_freq = 1 -beam_width = 1 -beam_update_prob = 1.0 [nlp.pipeline.ner.model] @architectures = "spacy.TransitionBasedParser.v1" @@ -75,6 +74,6 @@ width = 96 depth = 4 window_size = 1 embed_size = 2000 -maxout_pieces = 3 +maxout_pieces = 1 subword_features = true dropout = ${training:dropout} diff --git a/pyproject.toml b/pyproject.toml index 480c3290e..2c020ef66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ requires = [ "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", "thinc>=8.0.0a12,<8.0.0a20", - "blis>=0.4.0,<0.5.0" + "blis>=0.4.0,<0.5.0", + "pytokenizations" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index cd123e341..3e1329de9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.3.0,<2.0.0 +pytokenizations # Official Python utilities setuptools packaging diff --git a/setup.cfg b/setup.cfg index 43a74d97e..9793bbb08 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = numpy>=1.15.0 requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 + pytokenizations # Official Python utilities setuptools packaging diff --git a/setup.py b/setup.py index 731a19cba..3b43ca2d2 100755 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ #!/usr/bin/env python +from setuptools import Extension, setup, find_packages import sys import platform from distutils.command.build_ext import build_ext from distutils.sysconfig import get_python_inc import distutils.util from distutils import ccompiler, msvccompiler -from setuptools import Extension, setup, find_packages import numpy from pathlib import Path import shutil @@ -23,7 +23,6 @@ Options.docstrings = True PACKAGES = find_packages() MOD_NAMES = [ - "spacy.gold.align", "spacy.gold.example", "spacy.parts_of_speech", "spacy.strings", diff --git a/spacy/about.py b/spacy/about.py index 5b2a89c61..8f374e2fe 100644 --- a/spacy/about.py +++ 
b/spacy/about.py @@ -1,8 +1,7 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a1" +__version__ = "3.0.0a2" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" -__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json" __projects__ = "https://github.com/explosion/spacy-boilerplates" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 5dc3070b6..0568b34de 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,8 +15,10 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 -from .project import project_clone, project_assets, project_run # noqa: F401 -from .project import project_run_all # noqa: F401 +from .project.clone import project_clone # noqa: F401 +from .project.assets import project_assets # noqa: F401 +from .project.run import project_run # noqa: F401 +from .project.dvc import project_update_dvc # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py index 2b3ad9524..e970c4dde 100644 --- a/spacy/cli/_app.py +++ b/spacy/cli/_app.py @@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface DOCS: https://spacy.io/api/cli """ +PROJECT_HELP = f"""Command-line interface for spaCy projects and working with +project templates. You'd typically start by cloning a project template to a local +directory and fetching its assets like datasets etc. See the project's +project.yml for the available commands. +""" app = typer.Typer(name=NAME, help=HELP) +project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True) +app.add_typer(project_cli) # Wrappers for Typer's annotations. Initially created to set defaults and to # keep the names short, but not needed at the moment. diff --git a/spacy/cli/download.py b/spacy/cli/download.py index ea5e7a890..f192cb196 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,4 +1,4 @@ -from typing import Optional, Sequence, Union +from typing import Optional, Sequence import requests import sys from wasabi import msg @@ -8,6 +8,23 @@ from ._app import app, Arg, Opt from .. import about from ..util import is_package, get_base_version, run_command +# These are the old shortcuts we previously supported in spacy download. As of +# v3, shortcuts are deprecated so we're not expecting to add anything to this +# list. It only exists to show users warnings. +OLD_SHORTCUTS = { + "en": "en_core_web_sm", + "de": "de_core_news_sm", + "es": "es_core_news_sm", + "pt": "pt_core_news_sm", + "fr": "fr_core_news_sm", + "it": "it_core_news_sm", + "nl": "nl_core_news_sm", + "el": "el_core_news_sm", + "nb": "nb_core_news_sm", + "lt": "lt_core_news_sm", + "xx": "xx_ent_wiki_sm", +} + @app.command( "download", @@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None: version = components[-1] download_model(dl_tpl.format(m=model_name, v=version), pip_args) else: - shortcuts = get_json(about.__shortcuts__, "available shortcuts") - model_name = shortcuts.get(model, model) + model_name = model + if model in OLD_SHORTCUTS: + msg.warn( + f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. " + f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead." 
+ ) + model_name = OLD_SHORTCUTS[model] compatibility = get_compatibility() version = get_version(model_name, compatibility) download_model(dl_tpl.format(m=model_name, v=version), pip_args) @@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None: ) -def get_json(url: str, desc: str) -> Union[dict, list]: - r = requests.get(url) +def get_compatibility() -> dict: + version = get_base_version(about.__version__) + r = requests.get(about.__compatibility__) if r.status_code != 200: msg.fail( f"Server error ({r.status_code})", - f"Couldn't fetch {desc}. Please find a model for your spaCy " + f"Couldn't fetch compatibility table. Please find a model for your spaCy " f"installation (v{about.__version__}), and download it manually. " f"For more details, see the documentation: " f"https://spacy.io/usage/models", exits=1, ) - return r.json() - - -def get_compatibility() -> dict: - version = get_base_version(about.__version__) - comp_table = get_json(about.__compatibility__, "compatibility table") + comp_table = r.json() comp = comp_table["spacy"] if version not in comp: msg.fail(f"No compatible models found for v{version} of spaCy", exits=1) diff --git a/spacy/cli/project.py b/spacy/cli/project.py deleted file mode 100644 index 200471127..000000000 --- a/spacy/cli/project.py +++ /dev/null @@ -1,708 +0,0 @@ -from typing import List, Dict, Any, Optional, Sequence -import typer -import srsly -from pathlib import Path -from wasabi import msg -import subprocess -import os -import re -import shutil -import sys -import requests -import tqdm - -from ._app import app, Arg, Opt, COMMAND, NAME -from .. import about -from ..schemas import ProjectConfigSchema, validate -from ..util import ensure_path, run_command, make_tempdir, working_dir -from ..util import get_hash, get_checksum, split_command - - -CONFIG_FILE = "project.yml" -DVC_CONFIG = "dvc.yaml" -DVC_DIR = ".dvc" -DIRS = [ - "assets", - "metas", - "configs", - "packages", - "metrics", - "scripts", - "notebooks", - "training", - "corpus", -] -CACHES = [ - Path.home() / ".torch", - Path.home() / ".caches" / "torch", - os.environ.get("TORCH_HOME"), - Path.home() / ".keras", -] -DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit -# it directly and edit the project.yml instead and re-run the project.""" -CLI_HELP = f"""Command-line interface for spaCy projects and working with project -templates. You'd typically start by cloning a project template to a local -directory and fetching its assets like datasets etc. See the project's -{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data -Version Control) to manage input and output files and to ensure steps are only -re-run if their inputs change. -""" - -project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True) - - -@project_cli.callback(invoke_without_command=True) -def callback(ctx: typer.Context): - """This runs before every project command and ensures DVC is installed.""" - ensure_dvc() - - -################ -# CLI COMMANDS # -################ - - -@project_cli.command("clone") -def project_clone_cli( - # fmt: off - name: str = Arg(..., help="The name of the template to fetch"), - dest: Path = Arg(Path.cwd(), help="Where to download and work. 
Defaults to current working directory.", exists=False), - repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), - git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), - no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"), - # fmt: on -): - """Clone a project template from a repository. Calls into "git" and will - only download the files from the given subdirectory. The GitHub repo - defaults to the official spaCy template repo, but can be customized - (including using a private repo). Setting the --git flag will also - initialize the project directory as a Git repo. If the project is intended - to be a Git repo, it should be initialized with Git first, before - initializing DVC (Data Version Control). This allows DVC to integrate with - Git. - """ - if dest == Path.cwd(): - dest = dest / name - project_clone(name, dest, repo=repo, git=git, no_init=no_init) - - -@project_cli.command("init") -def project_init_cli( - # fmt: off - path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), - force: bool = Opt(False, "--force", "-F", help="Force initiziation"), - # fmt: on -): - """Initialize a project directory with DVC and optionally Git. This should - typically be taken care of automatically when you run the "project clone" - command, but you can also run it separately. If the project is intended to - be a Git repo, it should be initialized with Git first, before initializing - DVC. This allows DVC to integrate with Git. - """ - project_init(path, git=git, force=force, silent=True) - - -@project_cli.command("assets") -def project_assets_cli( - # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - # fmt: on -): - """Use DVC (Data Version Control) to fetch project assets. Assets are - defined in the "assets" section of the project config. If possible, DVC - will try to track the files so you can pull changes from upstream. It will - also try and store the checksum so the assets are versioned. If the file - can't be tracked or checked, it will be downloaded without DVC. If a checksum - is provided in the project config, the file is only downloaded if no local - file with the same checksum exists. - """ - project_assets(project_dir) - - -@project_cli.command( - "run-all", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def project_run_all_cli( - # fmt: off - ctx: typer.Context, - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") - # fmt: on -): - """Run all commands defined in the project. This command will use DVC and - the defined outputs and dependencies in the project config to determine - which steps need to be re-run and where to start. This means you're only - re-generating data if the inputs have changed. 
- - This command calls into "dvc repro" and all additional arguments are passed - to the "dvc repro" command: https://dvc.org/doc/command-reference/repro - """ - if show_help: - print_run_help(project_dir) - else: - project_run_all(project_dir, *ctx.args) - - -@project_cli.command( - "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def project_run_cli( - # fmt: off - ctx: typer.Context, - subcommand: str = Arg(None, help="Name of command defined in project config"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") - # fmt: on -): - """Run a named script defined in the project config. If the command is - part of the default pipeline defined in the "run" section, DVC is used to - determine whether the step should re-run if its inputs have changed, or - whether everything is up to date. If the script is not part of the default - pipeline, it will be called separately without DVC. - - If DVC is used, the command calls into "dvc repro" and all additional - arguments are passed to the "dvc repro" command: - https://dvc.org/doc/command-reference/repro - """ - if show_help or not subcommand: - print_run_help(project_dir, subcommand) - else: - project_run(project_dir, subcommand, *ctx.args) - - -@project_cli.command("exec", hidden=True) -def project_exec_cli( - # fmt: off - subcommand: str = Arg(..., help="Name of command defined in project config"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - # fmt: on -): - """Execute a command defined in the project config. This CLI command is - only called internally in auto-generated DVC pipelines, as a shortcut for - multi-step commands in the project config. You typically shouldn't have to - call it yourself. To run a command, call "run" or "run-all". - """ - project_exec(project_dir, subcommand) - - -@project_cli.command("update-dvc") -def project_update_dvc_cli( - # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), - force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), - # fmt: on -): - """Update the auto-generated DVC config file. Uses the steps defined in the - "run" section of the project config. This typically happens automatically - when running a command, but can also be triggered manually if needed. - """ - config = load_project_config(project_dir) - updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) - if updated: - msg.good(f"Updated DVC config from {CONFIG_FILE}") - else: - msg.info(f"No changes found in {CONFIG_FILE}, no update needed") - - -app.add_typer(project_cli, name="project") - - -################# -# CLI FUNCTIONS # -################# - - -def project_clone( - name: str, - dest: Path, - *, - repo: str = about.__projects__, - git: bool = False, - no_init: bool = False, -) -> None: - """Clone a project template from a repository. - - name (str): Name of subdirectory to clone. - dest (Path): Destination path of cloned project. - repo (str): URL of Git repo containing project templates. - git (bool): Initialize project as Git repo. 
Should be set to True if project - is intended as a repo, since it will allow DVC to integrate with Git. - no_init (bool): Don't initialize DVC and Git automatically. If True, the - "init" command or "git init" and "dvc init" need to be run manually. - """ - dest = ensure_path(dest) - check_clone(name, dest, repo) - project_dir = dest.resolve() - # We're using Git and sparse checkout to only clone the files we need - with make_tempdir() as tmp_dir: - cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" - try: - run_command(cmd) - except SystemExit: - err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." - msg.fail(err) - with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: - f.write(name) - try: - run_command(["git", "-C", str(tmp_dir), "fetch"]) - run_command(["git", "-C", str(tmp_dir), "checkout"]) - except SystemExit: - err = f"Could not clone '{name}' in the repo '{repo}'." - msg.fail(err) - shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) - msg.good(f"Cloned project '{name}' from {repo} into {project_dir}") - for sub_dir in DIRS: - dir_path = project_dir / sub_dir - if not dir_path.exists(): - dir_path.mkdir(parents=True) - if not no_init: - project_init(project_dir, git=git, force=True, silent=True) - msg.good(f"Your project is now ready!", dest) - print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") - - -def project_init( - project_dir: Path, - *, - git: bool = False, - force: bool = False, - silent: bool = False, - analytics: bool = False, -): - """Initialize a project as a DVC and (optionally) as a Git repo. - - project_dir (Path): Path to project directory. - git (bool): Also call "git init" to initialize directory as a Git repo. - silent (bool): Don't print any output (via DVC). - analytics (bool): Opt-in to DVC analytics (defaults to False). - """ - with working_dir(project_dir) as cwd: - if git: - run_command(["git", "init"]) - init_cmd = ["dvc", "init"] - if silent: - init_cmd.append("--quiet") - if not git: - init_cmd.append("--no-scm") - if force: - init_cmd.append("--force") - run_command(init_cmd) - # We don't want to have analytics on by default – our users should - # opt-in explicitly. If they want it, they can always enable it. - if not analytics: - run_command(["dvc", "config", "core.analytics", "false"]) - # Remove unused and confusing plot templates from .dvc directory - # TODO: maybe we shouldn't do this, but it's otherwise super confusing - # once you commit your changes via Git and it creates a bunch of files - # that have no purpose - plots_dir = cwd / DVC_DIR / "plots" - if plots_dir.exists(): - shutil.rmtree(str(plots_dir)) - config = load_project_config(cwd) - setup_check_dvc(cwd, config) - - -def project_assets(project_dir: Path) -> None: - """Fetch assets for a project using DVC if possible. - - project_dir (Path): Path to project directory. 
- """ - project_path = ensure_path(project_dir) - config = load_project_config(project_path) - setup_check_dvc(project_path, config) - assets = config.get("assets", {}) - if not assets: - msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) - msg.info(f"Fetching {len(assets)} asset(s)") - variables = config.get("variables", {}) - fetched_assets = [] - for asset in assets: - url = asset["url"].format(**variables) - dest = asset["dest"].format(**variables) - fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum")) - if fetched_path: - fetched_assets.append(str(fetched_path)) - if fetched_assets: - with working_dir(project_path): - run_command(["dvc", "add", *fetched_assets, "--external"]) - - -def fetch_asset( - project_path: Path, url: str, dest: Path, checksum: Optional[str] = None -) -> Optional[Path]: - """Fetch an asset from a given URL or path. Will try to import the file - using DVC's import-url if possible (fully tracked and versioned) and falls - back to get-url (versioned) and a non-DVC download if necessary. If a - checksum is provided and a local file exists, it's only re-downloaded if the - checksum doesn't match. - - project_path (Path): Path to project directory. - url (str): URL or path to asset. - checksum (Optional[str]): Optional expected checksum of local file. - RETURNS (Optional[Path]): The path to the fetched asset or None if fetching - the asset failed. - """ - url = convert_asset_url(url) - dest_path = (project_path / dest).resolve() - if dest_path.exists() and checksum: - # If there's already a file, check for checksum - # TODO: add support for caches (dvc import-url with local path) - if checksum == get_checksum(dest_path): - msg.good(f"Skipping download with matching checksum: {dest}") - return dest_path - with working_dir(project_path): - try: - # If these fail, we don't want to output an error or info message. - # Try with tracking the source first, then just downloading with - # DVC, then a regular non-DVC download. - try: - dvc_cmd = ["dvc", "import-url", url, str(dest_path)] - print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - except subprocess.CalledProcessError: - dvc_cmd = ["dvc", "get-url", url, str(dest_path)] - print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - except subprocess.CalledProcessError: - try: - download_file(url, dest_path) - except requests.exceptions.HTTPError as e: - msg.fail(f"Download failed: {dest}", e) - return None - if checksum and checksum != get_checksum(dest_path): - msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") - msg.good(f"Fetched asset {dest}") - return dest_path - - -def project_run_all(project_dir: Path, *dvc_args) -> None: - """Run all commands defined in the project using DVC. - - project_dir (Path): Path to project directory. - *dvc_args: Other arguments passed to "dvc repro". - """ - config = load_project_config(project_dir) - setup_check_dvc(project_dir, config) - dvc_cmd = ["dvc", "repro", *dvc_args] - with working_dir(project_dir): - run_command(dvc_cmd) - - -def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: - """Simulate a CLI help prompt using the info available in the project config. - - project_dir (Path): The project directory. - subcommand (Optional[str]): The subcommand or None. If a subcommand is - provided, the subcommand help is shown. Otherwise, the top-level help - and a list of available commands is printed. 
- """ - config = load_project_config(project_dir) - setup_check_dvc(project_dir, config) - config_commands = config.get("commands", []) - commands = {cmd["name"]: cmd for cmd in config_commands} - if subcommand: - validate_subcommand(commands.keys(), subcommand) - print(f"Usage: {COMMAND} project run {subcommand} {project_dir}") - help_text = commands[subcommand].get("help") - if help_text: - msg.text(f"\n{help_text}\n") - else: - print(f"\nAvailable commands in {CONFIG_FILE}") - print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}") - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - msg.text("Run all commands defined in the 'run' block of the project config:") - print(f"{COMMAND} project run-all {project_dir}") - - -def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: - """Run a named script defined in the project config. If the script is part - of the default pipeline (defined in the "run" section), DVC is used to - execute the command, so it can determine whether to rerun it. It then - calls into "exec" to execute it. - - project_dir (Path): Path to project directory. - subcommand (str): Name of command to run. - *dvc_args: Other arguments passed to "dvc repro". - """ - config = load_project_config(project_dir) - setup_check_dvc(project_dir, config) - config_commands = config.get("commands", []) - variables = config.get("variables", {}) - commands = {cmd["name"]: cmd for cmd in config_commands} - validate_subcommand(commands.keys(), subcommand) - if subcommand in config.get("run", []): - # This is one of the pipeline commands tracked in DVC - dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] - with working_dir(project_dir): - run_command(dvc_cmd) - else: - cmd = commands[subcommand] - # Deps in non-DVC commands aren't tracked, but if they're defined, - # make sure they exist before running the command - for dep in cmd.get("deps", []): - if not (project_dir / dep).exists(): - err = f"Missing dependency specified by command '{subcommand}': {dep}" - msg.fail(err, exits=1) - with working_dir(project_dir): - run_commands(cmd["script"], variables) - - -def project_exec(project_dir: Path, subcommand: str): - """Execute a command defined in the project config. - - project_dir (Path): Path to project directory. - subcommand (str): Name of command to run. - """ - config = load_project_config(project_dir) - config_commands = config.get("commands", []) - variables = config.get("variables", {}) - commands = {cmd["name"]: cmd for cmd in config_commands} - with working_dir(project_dir): - run_commands(commands[subcommand]["script"], variables) - - -########### -# HELPERS # -########### - - -def load_project_config(path: Path) -> Dict[str, Any]: - """Load the project config file from a directory and validate it. - - path (Path): The path to the project directory. - RETURNS (Dict[str, Any]): The loaded project config. 
- """ - config_path = path / CONFIG_FILE - if not config_path.exists(): - msg.fail("Can't find project config", config_path, exits=1) - invalid_err = f"Invalid project config in {CONFIG_FILE}" - try: - config = srsly.read_yaml(config_path) - except ValueError as e: - msg.fail(invalid_err, e, exits=1) - errors = validate(ProjectConfigSchema, config) - if errors: - msg.fail(invalid_err, "\n".join(errors), exits=1) - return config - - -def update_dvc_config( - path: Path, - config: Dict[str, Any], - verbose: bool = False, - silent: bool = False, - force: bool = False, -) -> bool: - """Re-run the DVC commands in dry mode and update dvc.yaml file in the - project directory. The file is auto-generated based on the config. The - first line of the auto-generated file specifies the hash of the config - dict, so if any of the config values change, the DVC config is regenerated. - - path (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project config. - verbose (bool): Whether to print additional info (via DVC). - silent (bool): Don't output anything (via DVC). - force (bool): Force update, even if hashes match. - RETURNS (bool): Whether the DVC config file was updated. - """ - config_hash = get_hash(config) - path = path.resolve() - dvc_config_path = path / DVC_CONFIG - if dvc_config_path.exists(): - # Check if the file was generated using the current config, if not, redo - with dvc_config_path.open("r", encoding="utf8") as f: - ref_hash = f.readline().strip().replace("# ", "") - if ref_hash == config_hash and not force: - return False # Nothing has changed in project config, don't need to update - dvc_config_path.unlink() - variables = config.get("variables", {}) - commands = [] - # We only want to include commands that are part of the main list of "run" - # commands in project.yml and should be run in sequence - config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} - for name in config.get("run", []): - validate_subcommand(config_commands.keys(), name) - command = config_commands[name] - deps = command.get("deps", []) - outputs = command.get("outputs", []) - outputs_no_cache = command.get("outputs_no_cache", []) - if not deps and not outputs and not outputs_no_cache: - continue - # Default to the working dir as the project path since dvc.yaml is auto-generated - # and we don't want arbitrary paths in there - project_cmd = ["python", "-m", NAME, "project", "exec", name] - deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] - outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] - outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] - dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"] - if verbose: - dvc_cmd.append("--verbose") - if silent: - dvc_cmd.append("--quiet") - full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] - commands.append(" ".join(full_cmd)) - with working_dir(path): - run_commands(commands, variables, silent=True) - with dvc_config_path.open("r+", encoding="utf8") as f: - content = f.read() - f.seek(0, 0) - f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") - return True - - -def ensure_dvc() -> None: - """Ensure that the "dvc" command is available and show an error if not.""" - try: - subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - "spaCy projects require DVC (Data Version Control) and the 'dvc' command", - "You can install the Python package from pip (pip install dvc) or " - 
"conda (conda install -c conda-forge dvc). For more details, see the " - "documentation: https://dvc.org/doc/install", - exits=1, - ) - - -def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: - """Check that the project is set up correctly with DVC and update its - config if needed. Will raise an error if the project is not an initialized - DVC project. - - project_dir (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project config. - """ - if not project_dir.exists(): - msg.fail(f"Can't find project directory: {project_dir}") - if not (project_dir / ".dvc").exists(): - msg.fail( - "Project not initialized as a DVC project.", - f"Make sure that the project template was cloned correctly. To " - f"initialize the project directory manually, you can run: " - f"{COMMAND} project init {project_dir}", - exits=1, - ) - with msg.loading("Updating DVC config..."): - updated = update_dvc_config(project_dir, config, silent=True) - if updated: - msg.good(f"Updated DVC config from changed {CONFIG_FILE}") - - -def run_commands( - commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False -) -> None: - """Run a sequence of commands in a subprocess, in order. - - commands (List[str]): The string commands. - variables (Dict[str, str]): Dictionary of variable names, mapped to their - values. Will be used to substitute format string variables in the - commands. - silent (bool): Don't print the commands. - """ - for command in commands: - # Substitute variables, e.g. "./{NAME}.json" - command = command.format(**variables) - command = split_command(command) - # Not sure if this is needed or a good idea. Motivation: users may often - # use commands in their config that reference "python" and we want to - # make sure that it's always executing the same Python that spaCy is - # executed with and the pip in the same env, not some other Python/pip. - # Also ensures cross-compatibility if user 1 writes "python3" (because - # that's how it's set up on their system), and user 2 without the - # shortcut tries to re-run the command. - if len(command) and command[0] in ("python", "python3"): - command[0] = sys.executable - elif len(command) and command[0] in ("pip", "pip3"): - command = [sys.executable, "-m", "pip", *command[1:]] - if not silent: - print(f"Running command: {' '.join(command)}") - run_command(command) - - -def convert_asset_url(url: str) -> str: - """Check and convert the asset URL if needed. - - url (str): The asset URL. - RETURNS (str): The converted URL. - """ - # If the asset URL is a regular GitHub URL it's likely a mistake - if re.match("(http(s?)):\/\/github.com", url): - converted = url.replace("github.com", "raw.githubusercontent.com") - converted = re.sub(r"/(tree|blob)/", "/", converted) - msg.warn( - "Downloading from a regular GitHub URL. This will only download " - "the source of the page, not the actual file. Converting the URL " - "to a raw URL.", - converted, - ) - return converted - return url - - -def check_clone(name: str, dest: Path, repo: str) -> None: - """Check and validate that the destination path can be used to clone. Will - check that Git is available and that the destination path is suitable. - - name (str): Name of the directory to clone from the repo. - dest (Path): Local destination of cloned directory. - repo (str): URL of the repo to clone from. 
- """ - try: - subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - f"Cloning spaCy project templates requires Git and the 'git' command. ", - f"To clone a project without Git, copy the files from the '{name}' " - f"directory in the {repo} to {dest} manually and then run:", - f"{COMMAND} project init {dest}", - exits=1, - ) - if not dest: - msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) - if dest.exists(): - # Directory already exists (not allowed, clone needs to create it) - msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) - if not dest.parent.exists(): - # We're not creating parents, parent dir should exist - msg.fail( - f"Can't clone project, parent directory doesn't exist: {dest.parent}", - exits=1, - ) - - -def validate_subcommand(commands: Sequence[str], subcommand: str) -> None: - """Check that a subcommand is valid and defined. Raises an error otherwise. - - commands (Sequence[str]): The available commands. - subcommand (str): The subcommand. - """ - if subcommand not in commands: - msg.fail( - f"Can't find command '{subcommand}' in {CONFIG_FILE}. " - f"Available commands: {', '.join(commands)}", - exits=1, - ) - - -def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: - """Download a file using requests. - - url (str): The URL of the file. - dest (Path): The destination path. - chunk_size (int): The size of chunks to read/write. - """ - response = requests.get(url, stream=True) - response.raise_for_status() - total = int(response.headers.get("content-length", 0)) - progress_settings = { - "total": total, - "unit": "iB", - "unit_scale": True, - "unit_divisor": chunk_size, - "leave": False, - } - with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar: - for data in response.iter_content(chunk_size=chunk_size): - size = f.write(data) - bar.update(size) diff --git a/spacy/cli/project/__init__.py b/spacy/cli/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py new file mode 100644 index 000000000..0ef3419f3 --- /dev/null +++ b/spacy/cli/project/assets.py @@ -0,0 +1,154 @@ +from typing import Optional +from pathlib import Path +from wasabi import msg +import requests +import tqdm +import re +import shutil + +from ...util import ensure_path, get_checksum, working_dir +from .._app import project_cli, Arg +from .util import PROJECT_FILE, load_project_config + + +# TODO: find a solution for caches +# CACHES = [ +# Path.home() / ".torch", +# Path.home() / ".caches" / "torch", +# os.environ.get("TORCH_HOME"), +# Path.home() / ".keras", +# ] + + +@project_cli.command("assets") +def project_assets_cli( + # fmt: off + project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), + # fmt: on +): + """Fetch project assets like datasets and pretrained weights. Assets are + defined in the "assets" section of the project.yml. If a checksum is + provided in the project.yml, the file is only downloaded if no local file + with the same checksum exists. + """ + project_assets(project_dir) + + +def project_assets(project_dir: Path) -> None: + """Fetch assets for a project using DVC if possible. + + project_dir (Path): Path to project directory. 
+ """ + project_path = ensure_path(project_dir) + config = load_project_config(project_path) + assets = config.get("assets", {}) + if not assets: + msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) + msg.info(f"Fetching {len(assets)} asset(s)") + variables = config.get("variables", {}) + for asset in assets: + dest = asset["dest"].format(**variables) + url = asset.get("url") + checksum = asset.get("checksum") + if not url: + # project.yml defines asset without URL that the user has to place + check_private_asset(dest, checksum) + continue + url = url.format(**variables) + fetch_asset(project_path, url, dest, checksum) + + +def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None: + """Check and validate assets without a URL (private assets that the user + has to provide themselves) and give feedback about the checksum. + + dest (Path): Desintation path of the asset. + checksum (Optional[str]): Optional checksum of the expected file. + """ + if not Path(dest).exists(): + err = f"No URL provided for asset. You need to add this file yourself: {dest}" + msg.warn(err) + else: + if checksum and checksum == get_checksum(dest): + msg.good(f"Asset exists with matching checksum: {dest}") + else: + msg.fail(f"Asset available but with incorrect checksum: {dest}") + + +def fetch_asset( + project_path: Path, url: str, dest: Path, checksum: Optional[str] = None +) -> None: + """Fetch an asset from a given URL or path. If a checksum is provided and a + local file exists, it's only re-downloaded if the checksum doesn't match. + + project_path (Path): Path to project directory. + url (str): URL or path to asset. + checksum (Optional[str]): Optional expected checksum of local file. + RETURNS (Optional[Path]): The path to the fetched asset or None if fetching + the asset failed. + """ + # TODO: add support for caches + dest_path = (project_path / dest).resolve() + if dest_path.exists() and checksum: + # If there's already a file, check for checksum + if checksum == get_checksum(dest_path): + msg.good(f"Skipping download with matching checksum: {dest}") + return dest_path + with working_dir(project_path): + url = convert_asset_url(url) + try: + download_file(url, dest_path) + msg.good(f"Downloaded asset {dest}") + except requests.exceptions.RequestException as e: + if Path(url).exists() and Path(url).is_file(): + # If it's a local file, copy to destination + shutil.copy(url, str(dest_path)) + msg.good(f"Copied local asset {dest}") + else: + msg.fail(f"Download failed: {dest}", e) + return + if checksum and checksum != get_checksum(dest_path): + msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") + + +def convert_asset_url(url: str) -> str: + """Check and convert the asset URL if needed. + + url (str): The asset URL. + RETURNS (str): The converted URL. + """ + # If the asset URL is a regular GitHub URL it's likely a mistake + if re.match(r"(http(s?)):\/\/github.com", url): + converted = url.replace("github.com", "raw.githubusercontent.com") + converted = re.sub(r"/(tree|blob)/", "/", converted) + msg.warn( + "Downloading from a regular GitHub URL. This will only download " + "the source of the page, not the actual file. Converting the URL " + "to a raw URL.", + converted, + ) + return converted + return url + + +def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: + """Download a file using requests. + + url (str): The URL of the file. + dest (Path): The destination path. + chunk_size (int): The size of chunks to read/write. 
+ """ + response = requests.get(url, stream=True) + response.raise_for_status() + total = int(response.headers.get("content-length", 0)) + progress_settings = { + "total": total, + "unit": "iB", + "unit_scale": True, + "unit_divisor": chunk_size, + "leave": False, + } + with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar: + for data in response.iter_content(chunk_size=chunk_size): + size = f.write(data) + bar.update(size) diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py new file mode 100644 index 000000000..ee1fd790c --- /dev/null +++ b/spacy/cli/project/clone.py @@ -0,0 +1,110 @@ +from pathlib import Path +from wasabi import msg +import subprocess +import shutil + +from ... import about +from ...util import ensure_path, run_command, make_tempdir +from .._app import project_cli, Arg, Opt, COMMAND + + +DIRS = [ + "assets", + "metas", + "configs", + "packages", + "metrics", + "scripts", + "notebooks", + "training", + "corpus", +] + + +@project_cli.command("clone") +def project_clone_cli( + # fmt: off + name: str = Arg(..., help="The name of the template to fetch"), + dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), + repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), + # fmt: on +): + """Clone a project template from a repository. Calls into "git" and will + only download the files from the given subdirectory. The GitHub repo + defaults to the official spaCy template repo, but can be customized + (including using a private repo). Setting the --git flag will also + initialize the project directory as a Git repo. If the project is intended + to be a Git repo, it should be initialized with Git first, before + initializing DVC (Data Version Control). This allows DVC to integrate with + Git. + """ + if dest == Path.cwd(): + dest = dest / name + project_clone(name, dest, repo=repo) + + +def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None: + """Clone a project template from a repository. + + name (str): Name of subdirectory to clone. + dest (Path): Destination path of cloned project. + repo (str): URL of Git repo containing project templates. + """ + dest = ensure_path(dest) + check_clone(name, dest, repo) + project_dir = dest.resolve() + # We're using Git and sparse checkout to only clone the files we need + with make_tempdir() as tmp_dir: + cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" + try: + run_command(cmd) + except subprocess.CalledProcessError: + err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." + msg.fail(err) + with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: + f.write(name) + try: + run_command(["git", "-C", str(tmp_dir), "fetch"]) + run_command(["git", "-C", str(tmp_dir), "checkout"]) + except subprocess.CalledProcessError: + err = f"Could not clone '{name}' in the repo '{repo}'." + msg.fail(err) + shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) + msg.good(f"Cloned project '{name}' from {repo} into {project_dir}") + for sub_dir in DIRS: + dir_path = project_dir / sub_dir + if not dir_path.exists(): + dir_path.mkdir(parents=True) + msg.good(f"Your project is now ready!", dest) + print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") + + +def check_clone(name: str, dest: Path, repo: str) -> None: + """Check and validate that the destination path can be used to clone. 
Will + check that Git is available and that the destination path is suitable. + + name (str): Name of the directory to clone from the repo. + dest (Path): Local destination of cloned directory. + repo (str): URL of the repo to clone from. + """ + try: + subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL) + except Exception: + msg.fail( + f"Cloning spaCy project templates requires Git and the 'git' command. ", + f"To clone a project without Git, copy the files from the '{name}' " + f"directory in the {repo} to {dest} manually and then run:", + f"{COMMAND} project init {dest}", + exits=1, + ) + if not dest: + msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) + if dest.exists(): + # Directory already exists (not allowed, clone needs to create it) + msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) + if not dest.parent.exists(): + # We're not creating parents, parent dir should exist + msg.fail( + f"Can't clone project, parent directory doesn't exist: {dest.parent}", + exits=1, + ) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py new file mode 100644 index 000000000..a98cb939a --- /dev/null +++ b/spacy/cli/project/dvc.py @@ -0,0 +1,206 @@ +"""This module contains helpers and subcommands for integrating spaCy projects +with Data Version Controk (DVC). https://dvc.org""" +from typing import Dict, Any, List, Optional +import subprocess +from pathlib import Path +from wasabi import msg + +from .util import PROJECT_FILE, load_project_config +from .._app import project_cli, Arg, Opt, NAME, COMMAND +from ...util import get_hash, working_dir, split_command, join_command, run_command + + +DVC_CONFIG = "dvc.yaml" +DVC_DIR = ".dvc" +UPDATE_COMMAND = "dvc" +DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've +# edited your {PROJECT_FILE}, you can regenerate this file by running: +# {COMMAND} project {UPDATE_COMMAND}""" + + +@project_cli.command(UPDATE_COMMAND) +def project_update_dvc_cli( + # fmt: off + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), + workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."), + verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), + force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), + # fmt: on +): + """Auto-generate Data Version Control (DVC) config. A DVC + project can only define one pipeline, so you need to specify one workflow + defined in the project.yml. If no workflow is specified, the first defined + workflow is used. The DVC config will only be updated if + """ + project_update_dvc(project_dir, workflow, verbose=verbose, force=force) + + +def project_update_dvc( + project_dir: Path, + workflow: Optional[str] = None, + *, + verbose: bool = False, + force: bool = False, +) -> None: + """Update the auto-generated Data Version Control (DVC) config file. A DVC + project can only define one pipeline, so you need to specify one workflow + defined in the project.yml. Will only update the file if the checksum changed. + + project_dir (Path): The project directory. + workflow (Optional[str]): Optional name of workflow defined in project.yml. + If not set, the first workflow will be used. + verbose (bool): Print more info. + force (bool): Force update DVC config. 
+ """ + config = load_project_config(project_dir) + updated = update_dvc_config( + project_dir, config, workflow, verbose=verbose, force=force + ) + help_msg = "To execute the workflow with DVC, run: dvc repro" + if updated: + msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg) + else: + msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg) + + +def update_dvc_config( + path: Path, + config: Dict[str, Any], + workflow: Optional[str] = None, + verbose: bool = False, + silent: bool = False, + force: bool = False, +) -> bool: + """Re-run the DVC commands in dry mode and update dvc.yaml file in the + project directory. The file is auto-generated based on the config. The + first line of the auto-generated file specifies the hash of the config + dict, so if any of the config values change, the DVC config is regenerated. + + path (Path): The path to the project directory. + config (Dict[str, Any]): The loaded project.yml. + verbose (bool): Whether to print additional info (via DVC). + silent (bool): Don't output anything (via DVC). + force (bool): Force update, even if hashes match. + RETURNS (bool): Whether the DVC config file was updated. + """ + ensure_dvc(path) + workflows = config.get("workflows", {}) + workflow_names = list(workflows.keys()) + check_workflows(workflow_names, workflow) + if not workflow: + workflow = workflow_names[0] + config_hash = get_hash(config) + path = path.resolve() + dvc_config_path = path / DVC_CONFIG + if dvc_config_path.exists(): + # Check if the file was generated using the current config, if not, redo + with dvc_config_path.open("r", encoding="utf8") as f: + ref_hash = f.readline().strip().replace("# ", "") + if ref_hash == config_hash and not force: + return False # Nothing has changed in project.yml, don't need to update + dvc_config_path.unlink() + variables = config.get("variables", {}) + dvc_commands = [] + config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + for name in workflows[workflow]: + command = config_commands[name] + deps = command.get("deps", []) + outputs = command.get("outputs", []) + outputs_no_cache = command.get("outputs_no_cache", []) + if not deps and not outputs and not outputs_no_cache: + continue + # Default to the working dir as the project path since dvc.yaml is auto-generated + # and we don't want arbitrary paths in there + project_cmd = ["python", "-m", NAME, "project", "run", name] + deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] + outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] + outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] + dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"] + full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] + dvc_commands.append(join_command(full_cmd)) + with working_dir(path): + dvc_flags = {"--verbose": verbose, "--quiet": silent} + run_dvc_commands(dvc_commands, variables, flags=dvc_flags) + with dvc_config_path.open("r+", encoding="utf8") as f: + content = f.read() + f.seek(0, 0) + f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") + return True + + +def run_dvc_commands( + commands: List[str] = tuple(), + variables: Dict[str, str] = {}, + flags: Dict[str, bool] = {}, +) -> None: + """Run a sequence of DVC commands in a subprocess, in order. + + commands (List[str]): The string commands without the leading "dvc". + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. 
Will be used to substitute format string variables in the + commands. + flags (Dict[str, bool]): Conditional flags to be added to command. Makes it + easier to pass flags like --quiet that depend on a variable or + command-line setting while avoiding lots of nested conditionals. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + dvc_command = ["dvc", *command] + # Add the flags if they are set to True + for flag, is_active in flags.items(): + if is_active: + dvc_command.append(flag) + run_command(dvc_command) + + +def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None: + """Validate workflows provided in project.yml and check that a given + workflow can be used to generate a DVC config. + + workflows (List[str]): Names of the available workflows. + workflow (Optional[str]): The name of the workflow to convert. + """ + if not workflows: + msg.fail( + f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, " + f"define at least one list of commands.", + exits=1, + ) + if workflow is not None and workflow not in workflows: + msg.fail( + f"Workflow '{workflow}' not defined in {PROJECT_FILE}. " + f"Available workflows: {', '.join(workflows)}", + exits=1, + ) + if not workflow: + msg.warn( + f"No workflow specified for DVC pipeline. Using the first workflow " + f"defined in {PROJECT_FILE}: '{workflows[0]}'" + ) + + +def ensure_dvc(project_dir: Path) -> None: + """Ensure that the "dvc" command is available and that the current project + directory is an initialized DVC project. + """ + try: + subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) + except Exception: + msg.fail( + "To use spaCy projects with DVC (Data Version Control), DVC needs " + "to be installed and the 'dvc' command needs to be available", + "You can install the Python package from pip (pip install dvc) or " + "conda (conda install -c conda-forge dvc). For more details, see the " + "documentation: https://dvc.org/doc/install", + exits=1, + ) + if not (project_dir / ".dvc").exists(): + msg.fail( + "Project not initialized as a DVC project", + "To initialize a DVC project, you can run 'dvc init' in the project " + "directory. For more details, see the documentation: " + "https://dvc.org/doc/command-reference/init", + exits=1, + ) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py new file mode 100644 index 000000000..a4d7dd644 --- /dev/null +++ b/spacy/cli/project/run.py @@ -0,0 +1,250 @@ +from typing import Optional, List, Dict, Sequence, Any +from pathlib import Path +from wasabi import msg +import typer +import sys +import srsly + +from ...util import working_dir, run_command, split_command, is_cwd, get_checksum +from ...util import get_hash, join_command +from .._app import project_cli, Arg, Opt, COMMAND +from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config + + +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def project_run_cli( + # fmt: off + ctx: typer.Context, + subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. 
Defaults to current working directory.", exists=True, file_okay=False), + force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), + dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") + # fmt: on +): + """Run a named script or workflow defined in the project.yml. If a workflow + name is specified, all commands in the workflow are run, in order. If + commands define inputs and/or outputs, they will only be re-run if state + has changed. + """ + if show_help or not subcommand: + print_run_help(project_dir, subcommand) + else: + project_run(project_dir, subcommand, *ctx.args, force=force, dry=dry) + + +def project_run( + project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False +) -> None: + """Run a named script defined in the project.yml. If the script is part + of the default pipeline (defined in the "run" section), DVC is used to + execute the command, so it can determine whether to rerun it. It then + calls into "exec" to execute it. + + project_dir (Path): Path to project directory. + subcommand (str): Name of command to run. + force (bool): Force re-running, even if nothing changed. + dry (bool): Perform a dry run and don't execute commands. + """ + config = load_project_config(project_dir) + variables = config.get("variables", {}) + commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + workflows = config.get("workflows", {}) + validate_subcommand(commands.keys(), workflows.keys(), subcommand) + if subcommand in workflows: + msg.info(f"Running workflow '{subcommand}'") + for cmd in workflows[subcommand]: + project_run(project_dir, cmd, force=force, dry=dry) + else: + cmd = commands[subcommand] + variables = config.get("variables", {}) + for dep in cmd.get("deps", []): + dep = dep.format(**variables) + if not (project_dir / dep).exists(): + err = f"Missing dependency specified by command '{subcommand}': {dep}" + err_kwargs = {"exits": 1} if not dry else {} + msg.fail(err, **err_kwargs) + with working_dir(project_dir) as current_dir: + rerun = check_rerun(current_dir, cmd, variables) + if not rerun and not force: + msg.info(f"Skipping '{cmd['name']}': nothing changed") + else: + msg.divider(subcommand) + run_commands(cmd["script"], variables, dry=dry) + update_lockfile(current_dir, cmd, variables) + + +def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: + """Simulate a CLI help prompt using the info available in the project.yml. + + project_dir (Path): The project directory. + subcommand (Optional[str]): The subcommand or None. If a subcommand is + provided, the subcommand help is shown. Otherwise, the top-level help + and a list of available commands is printed. 
+ """ + config = load_project_config(project_dir) + config_commands = config.get("commands", []) + commands = {cmd["name"]: cmd for cmd in config_commands} + project_loc = "" if is_cwd(project_dir) else project_dir + if subcommand: + validate_subcommand(commands.keys(), subcommand) + print(f"Usage: {COMMAND} project run {subcommand} {project_loc}") + help_text = commands[subcommand].get("help") + if help_text: + msg.text(f"\n{help_text}\n") + else: + print(f"\nAvailable commands in {PROJECT_FILE}") + print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") + msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) + msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:") + print(f"{COMMAND} project run {project_loc}") + + +def run_commands( + commands: List[str] = tuple(), + variables: Dict[str, Any] = {}, + silent: bool = False, + dry: bool = False, +) -> None: + """Run a sequence of commands in a subprocess, in order. + + commands (List[str]): The string commands. + variables (Dict[str, Any]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + silent (bool): Don't print the commands. + dry (bool): Perform a dry run and don't execut anything. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + # Not sure if this is needed or a good idea. Motivation: users may often + # use commands in their config that reference "python" and we want to + # make sure that it's always executing the same Python that spaCy is + # executed with and the pip in the same env, not some other Python/pip. + # Also ensures cross-compatibility if user 1 writes "python3" (because + # that's how it's set up on their system), and user 2 without the + # shortcut tries to re-run the command. + if len(command) and command[0] in ("python", "python3"): + command[0] = sys.executable + elif len(command) and command[0] in ("pip", "pip3"): + command = [sys.executable, "-m", "pip", *command[1:]] + if not silent: + print(f"Running command: {join_command(command)}") + if not dry: + run_command(command) + + +def validate_subcommand( + commands: Sequence[str], workflows: Sequence[str], subcommand: str +) -> None: + """Check that a subcommand is valid and defined. Raises an error otherwise. + + commands (Sequence[str]): The available commands. + subcommand (str): The subcommand. + """ + if not commands and not workflows: + msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1) + if subcommand not in commands and subcommand not in workflows: + help_msg = [] + if commands: + help_msg.append(f"Available commands: {', '.join(commands)}") + if workflows: + help_msg.append(f"Available workflows: {', '.join(workflows)}") + msg.fail( + f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}", + ". ".join(help_msg), + exits=1, + ) + + +def check_rerun( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> bool: + """Check if a command should be rerun because its settings or inputs/outputs + changed. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. + RETURNS (bool): Whether to re-run the command. 
+ """ + lock_path = project_dir / PROJECT_LOCK + if not lock_path.exists(): # We don't have a lockfile, run command + return True + data = srsly.read_yaml(lock_path) + if command["name"] not in data: # We don't have info about this command + return True + entry = data[command["name"]] + # If the entry in the lockfile matches the lockfile entry that would be + # generated from the current command, we don't rerun because it means that + # all inputs/outputs, hashes and scripts are the same and nothing changed + return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry) + + +def update_lockfile( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> None: + """Update the lockfile after running a command. Will create a lockfile if + it doesn't yet exist and will add an entry for the current command, its + script and dependencies/outputs. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. + """ + lock_path = project_dir / PROJECT_LOCK + if not lock_path.exists(): + srsly.write_yaml(lock_path, {}) + data = {} + else: + data = srsly.read_yaml(lock_path) + data[command["name"]] = get_lock_entry(project_dir, command, variables) + srsly.write_yaml(lock_path, data) + + +def get_lock_entry( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> Dict[str, Any]: + """Get a lockfile entry for a given command. An entry includes the command, + the script (command steps) and a list of dependencies and outputs with + their paths and file hashes, if available. The format is based on the + dvc.lock files, to keep things consistent. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. + RETURNS (Dict[str, Any]): The lockfile entry. + """ + deps = get_fileinfo(project_dir, command.get("deps", []), variables) + outs = get_fileinfo(project_dir, command.get("outputs", []), variables) + outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables) + return { + "cmd": f"{COMMAND} run {command['name']}", + "script": command["script"], + "deps": deps, + "outs": [*outs, *outs_nc], + } + + +def get_fileinfo( + project_dir: Path, paths: List[str], variables: Dict[str, Any] +) -> List[Dict[str, str]]: + """Generate the file information for a list of paths (dependencies, outputs). + Includes the file path and the file's checksum. + + project_dir (Path): The current project directory. + paths (List[str]): The file paths. + variables (Dict[str, Any]): The variables defined in the project.yml. + RETURNS (List[Dict[str, str]]): The lockfile entry for a file. 
+ """ + data = [] + for path in paths: + path = path.format(**variables) + file_path = project_dir / path + md5 = get_checksum(file_path) if file_path.exists() else None + data.append({"path": path, "md5": md5}) + return data diff --git a/spacy/cli/project/util.py b/spacy/cli/project/util.py new file mode 100644 index 000000000..5f2dc59ee --- /dev/null +++ b/spacy/cli/project/util.py @@ -0,0 +1,57 @@ +from typing import Dict, Any +from pathlib import Path +from wasabi import msg +import srsly + +from ...schemas import ProjectConfigSchema, validate + + +PROJECT_FILE = "project.yml" +PROJECT_LOCK = "project.lock" + + +def load_project_config(path: Path) -> Dict[str, Any]: + """Load the project.yml file from a directory and validate it. + + path (Path): The path to the project directory. + RETURNS (Dict[str, Any]): The loaded project.yml. + """ + config_path = path / PROJECT_FILE + if not config_path.exists(): + msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) + invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." + try: + config = srsly.read_yaml(config_path) + except ValueError as e: + msg.fail(invalid_err, e, exits=1) + errors = validate(ProjectConfigSchema, config) + if errors: + msg.fail(invalid_err, "\n".join(errors), exits=1) + validate_project_commands(config) + return config + + +def validate_project_commands(config: Dict[str, Any]) -> None: + """Check that project commands and workflows are valid, don't contain + duplicates, don't clash and only refer to commands that exist. + + config (Dict[str, Any]): The loaded config. + """ + command_names = [cmd["name"] for cmd in config.get("commands", [])] + workflows = config.get("workflows", {}) + duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1]) + if duplicates: + err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}" + msg.fail(err, exits=1) + for workflow_name, workflow_steps in workflows.items(): + if workflow_name in command_names: + err = f"Can't use workflow name '{workflow_name}': name already exists as a command" + msg.fail(err, exits=1) + for step in workflow_steps: + if step not in command_names: + msg.fail( + f"Unknown command specified in workflow '{workflow_name}': {step}", + f"Workflows can only refer to commands defined in the 'commands' " + f"section of the {PROJECT_FILE}.", + exits=1, + ) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 3b71cdb9a..bda3c9ca2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -203,7 +203,8 @@ def train( msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") train_examples = list( corpus.train_dataset( - nlp, shuffle=False, gold_preproc=training["gold_preproc"] + nlp, shuffle=False, gold_preproc=training["gold_preproc"], + max_length=training["max_length"] ) ) nlp.begin_training(lambda: train_examples) @@ -306,11 +307,18 @@ def create_train_batches(nlp, corpus, cfg): if len(train_examples) == 0: raise ValueError(Errors.E988) epoch += 1 - batches = util.minibatch_by_words( - train_examples, - size=cfg["batch_size"], - discard_oversize=cfg["discard_oversize"], - ) + if cfg.get("batch_by_words", True): + batches = util.minibatch_by_words( + train_examples, + size=cfg["batch_size"], + discard_oversize=cfg["discard_oversize"], + ) + else: + batches = util.minibatch( + train_examples, + size=cfg["batch_size"], + ) + # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop try: first = next(batches) diff --git a/spacy/errors.py 
b/spacy/errors.py index 31533e7e2..5a4e0d0c7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -477,15 +477,14 @@ class Errors(object): E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E969 = ("Expected string values for field '{field}', but received {types} instead. ") E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?") E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " "array and {doc_length} for the Doc itself.") E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.") E973 = ("Unexpected type for NER data") E974 = ("Unknown {obj} attribute: {key}") - E975 = ("The method 'Example.from_dict' expects a Doc as first argument, " - "but got {type}") - E976 = ("The method 'Example.from_dict' expects a dict as second argument, " + E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, " "but received None.") E977 = ("Can not compare a MorphAnalysis with a string object. " "This is likely a bug in spaCy, so feel free to open an issue.") diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py index 9416bdd81..c8b5fc44d 100644 --- a/spacy/gold/__init__.py +++ b/spacy/gold/__init__.py @@ -1,6 +1,6 @@ from .corpus import Corpus from .example import Example -from .align import align +from .align import Alignment from .iob_utils import iob_to_biluo, biluo_to_iob from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags diff --git a/spacy/gold/align.pxd b/spacy/gold/align.pxd deleted file mode 100644 index ea3615863..000000000 --- a/spacy/gold/align.pxd +++ /dev/null @@ -1,8 +0,0 @@ -cdef class Alignment: - cdef public object cost - cdef public object i2j - cdef public object j2i - cdef public object i2j_multi - cdef public object j2i_multi - cdef public object cand_to_gold - cdef public object gold_to_cand diff --git a/spacy/gold/align.py b/spacy/gold/align.py new file mode 100644 index 000000000..0dd48d4cf --- /dev/null +++ b/spacy/gold/align.py @@ -0,0 +1,30 @@ +from typing import List +import numpy +from thinc.types import Ragged +from dataclasses import dataclass +import tokenizations + + +@dataclass +class Alignment: + x2y: Ragged + y2x: Ragged + + @classmethod + def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment": + x2y = _make_ragged(x2y) + y2x = _make_ragged(y2x) + return Alignment(x2y=x2y, y2x=y2x) + + @classmethod + def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": + x2y, y2x = tokenizations.get_alignments(A, B) + return Alignment.from_indices(x2y=x2y, y2x=y2x) + + +def _make_ragged(indices): + lengths = numpy.array([len(x) for x in indices], dtype="i") + flat = [] + for x in indices: + flat.extend(x) + return Ragged(numpy.array(flat, dtype="i"), lengths) diff --git a/spacy/gold/align.pyx b/spacy/gold/align.pyx deleted file mode 100644 index 80ba0346a..000000000 --- a/spacy/gold/align.pyx +++ /dev/null @@ -1,101 +0,0 @@ -import numpy -from ..errors import Errors, AlignmentError - - -cdef class Alignment: - def __init__(self, spacy_words, gold_words): - # Do many-to-one alignment for misaligned tokens. - # If we over-segment, we'll have one gold word that covers a sequence - # of predicted words - # If we under-segment, we'll have one predicted word that covers a - # sequence of gold words. - # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. 
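The new alignment is computed at the character level by the pytokenizations package, so differing token boundaries are tolerated. A small, indicative sketch of the call that Alignment.from_strings() wraps (the printed lists are what get_alignments typically returns for this input):

    import tokenizations

    spacy_words = ["obama", "'", "s", "podcasts", "."]
    gold_words = ["obama", "'s", "podcasts", "."]
    x2y, y2x = tokenizations.get_alignments(spacy_words, gold_words)
    # x2y[i] lists the gold indices token i maps to: both "'" and "s" map to "'s".
    print(x2y)  # e.g. [[0], [1], [1], [2], [3]]
    print(y2x)  # e.g. [[0], [1, 2], [3], [4]]

_make_ragged() above then packs each direction into a Ragged array, which is what Example.alignment indexes into below.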
That's many-to-many -- we don't do that - # except for NER spans where the start and end can be aligned. - cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words) - self.cost = cost - self.i2j = i2j - self.j2i = j2i - self.i2j_multi = i2j_multi - self.j2i_multi = j2i_multi - self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] - self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - - -def align(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - tokens_a = _normalize_for_alignment(tokens_a) - tokens_b = _normalize_for_alignment(tokens_b) - cost = 0 - a2b = numpy.empty(len(tokens_a), dtype="i") - b2a = numpy.empty(len(tokens_b), dtype="i") - a2b.fill(-1) - b2a.fill(-1) - a2b_multi = {} - b2a_multi = {} - i = 0 - j = 0 - offset_a = 0 - offset_b = 0 - while i < len(tokens_a) and j < len(tokens_b): - a = tokens_a[i][offset_a:] - b = tokens_b[j][offset_b:] - if a == b: - if offset_a == offset_b == 0: - a2b[i] = j - b2a[j] = i - elif offset_a == 0: - cost += 2 - a2b_multi[i] = j - elif offset_b == 0: - cost += 2 - b2a_multi[j] = i - offset_a = offset_b = 0 - i += 1 - j += 1 - elif a == "": - assert offset_a == 0 - cost += 1 - i += 1 - elif b == "": - assert offset_b == 0 - cost += 1 - j += 1 - elif b.startswith(a): - cost += 1 - if offset_a == 0: - a2b_multi[i] = j - i += 1 - offset_a = 0 - offset_b += len(a) - elif a.startswith(b): - cost += 1 - if offset_b == 0: - b2a_multi[j] = i - j += 1 - offset_b = 0 - offset_a += len(b) - else: - assert "".join(tokens_a) != "".join(tokens_b) - raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b)) - return cost, a2b, b2a, a2b_multi, b2a_multi - - -def _normalize_for_alignment(tokens): - return [w.replace(" ", "").lower() for w in tokens] diff --git a/spacy/gold/example.pxd b/spacy/gold/example.pxd index 736969ecd..1f63b12d0 100644 --- a/spacy/gold/example.pxd +++ b/spacy/gold/example.pxd @@ -1,8 +1,7 @@ from ..tokens.doc cimport Doc -from .align cimport Alignment cdef class Example: cdef readonly Doc x cdef readonly Doc y - cdef readonly Alignment _alignment + cdef readonly object _alignment diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 7b629dcd2..f5b9f0eeb 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -6,10 +6,9 @@ from ..tokens.doc cimport Doc from ..tokens.span cimport Span from ..tokens.span import Span from ..attrs import IDS -from .align cimport Alignment +from .align import Alignment from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .iob_utils import spans_from_biluo_tags -from .align import Alignment from ..errors import Errors, Warnings from ..syntax import nonproj @@ -28,8 +27,7 @@ cpdef Doc 
annotations2doc(vocab, tok_annot, doc_annot): cdef class Example: - def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None): - """ Doc can either be text, or an actual Doc """ + def __init__(self, Doc predicted, Doc reference, *, alignment=None): if predicted is None: raise TypeError(Errors.E972.format(arg="predicted")) if reference is None: @@ -60,17 +58,15 @@ cdef class Example: @classmethod def from_dict(cls, Doc predicted, dict example_dict): + if predicted is None: + raise ValueError(Errors.E976.format(n="first", type="Doc")) if example_dict is None: - raise ValueError(Errors.E976) - if not isinstance(predicted, Doc): - raise TypeError(Errors.E975.format(type=type(predicted))) + raise ValueError(Errors.E976.format(n="second", type="dict")) example_dict = _fix_legacy_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict) if "ORTH" not in tok_dict: tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] - if not _has_field(tok_dict, "SPACY"): - spaces = _guess_spaces(predicted.text, tok_dict["ORTH"]) return Example( predicted, annotations2doc(predicted.vocab, tok_dict, doc_dict) @@ -83,34 +79,38 @@ cdef class Example: gold_words = [token.orth_ for token in self.reference] if gold_words == []: gold_words = spacy_words - self._alignment = Alignment(spacy_words, gold_words) + self._alignment = Alignment.from_strings(spacy_words, gold_words) return self._alignment def get_aligned(self, field, as_string=False): """Return an aligned array for a token attribute.""" - i2j_multi = self.alignment.i2j_multi - cand_to_gold = self.alignment.cand_to_gold + align = self.alignment.x2y vocab = self.reference.vocab gold_values = self.reference.to_array([field]) output = [None] * len(self.predicted) - for i, gold_i in enumerate(cand_to_gold): - if self.predicted[i].text.isspace(): - output[i] = None - if gold_i is None: - if i in i2j_multi: - output[i] = gold_values[i2j_multi[i]] - else: - output[i] = None + for token in self.predicted: + if token.is_space: + output[token.i] = None else: - output[i] = gold_values[gold_i] + values = gold_values[align[token.i].dataXd] + values = values.ravel() + if len(values) == 0: + output[token.i] = None + elif len(values) == 1: + output[token.i] = values[0] + elif len(set(list(values))) == 1: + # If all aligned tokens have the same value, use it. 
+ output[token.i] = values[0] + else: + output[token.i] = None if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] return output def get_aligned_parse(self, projectivize=True): - cand_to_gold = self.alignment.cand_to_gold - gold_to_cand = self.alignment.gold_to_cand + cand_to_gold = self.alignment.x2y + gold_to_cand = self.alignment.y2x aligned_heads = [None] * self.x.length aligned_deps = [None] * self.x.length heads = [token.head.i for token in self.y] @@ -118,52 +118,51 @@ cdef class Example: if projectivize: heads, deps = nonproj.projectivize(heads, deps) for cand_i in range(self.x.length): - gold_i = cand_to_gold[cand_i] - if gold_i is not None: # Alignment found - gold_head = gold_to_cand[heads[gold_i]] - if gold_head is not None: - aligned_heads[cand_i] = gold_head + if cand_to_gold.lengths[cand_i] == 1: + gold_i = cand_to_gold[cand_i].dataXd[0, 0] + if gold_to_cand.lengths[heads[gold_i]] == 1: + aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0]) aligned_deps[cand_i] = deps[gold_i] return aligned_heads, aligned_deps + def get_aligned_spans_x2y(self, x_spans): + return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y) + + def get_aligned_spans_y2x(self, y_spans): + return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x) + + def _get_aligned_spans(self, doc, spans, align): + seen = set() + output = [] + for span in spans: + indices = align[span.start : span.end].data.ravel() + indices = [idx for idx in indices if idx not in seen] + if len(indices) >= 1: + aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label) + target_text = span.text.lower().strip().replace(" ", "") + our_text = aligned_span.text.lower().strip().replace(" ", "") + if our_text == target_text: + output.append(aligned_span) + seen.update(indices) + return output + def get_aligned_ner(self): if not self.y.is_nered: return [None] * len(self.x) # should this be 'missing' instead of 'None' ? - x_text = self.x.text - # Get a list of entities, and make spans for non-entity tokens. - # We then work through the spans in order, trying to find them in - # the text and using that to get the offset. Any token that doesn't - # get a tag set this way is tagged None. - # This could maybe be improved? It at least feels easy to reason about. - y_spans = list(self.y.ents) - y_spans.sort() - x_text_offset = 0 - x_spans = [] - for y_span in y_spans: - if x_text.count(y_span.text) >= 1: - start_char = x_text.index(y_span.text) + x_text_offset - end_char = start_char + len(y_span.text) - x_span = self.x.char_span(start_char, end_char, label=y_span.label) - if x_span is not None: - x_spans.append(x_span) - x_text = self.x.text[end_char:] - x_text_offset = end_char + x_ents = self.get_aligned_spans_y2x(self.y.ents) + # Default to 'None' for missing values x_tags = biluo_tags_from_offsets( self.x, - [(e.start_char, e.end_char, e.label_) for e in x_spans], + [(e.start_char, e.end_char, e.label_) for e in x_ents], missing=None ) - gold_to_cand = self.alignment.gold_to_cand - for token in self.y: - if token.ent_iob_ == "O": - cand_i = gold_to_cand[token.i] - if cand_i is not None and x_tags[cand_i] is None: - x_tags[cand_i] = "O" - i2j_multi = self.alignment.i2j_multi - for i, tag in enumerate(x_tags): - if tag is None and i in i2j_multi: - gold_i = i2j_multi[i] - if gold_i is not None and self.y[gold_i].ent_iob_ == "O": + # Now fill the tokens we can align to O. 
+ O = 2 # I=1, O=2, B=3 + for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")): + if x_tags[i] is None: + if ent_iob == O: + x_tags[i] = "O" + elif self.x[i].is_space: x_tags[i] = "O" return x_tags @@ -194,25 +193,22 @@ cdef class Example: links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0} return links - def split_sents(self): """ Split the token annotations into multiple Examples based on sent_starts and return a list of the new Examples""" if not self.reference.is_sentenced: return [self] - - sent_starts = self.get_aligned("SENT_START") - sent_starts.append(1) # appending virtual start of a next sentence to facilitate search - + + align = self.alignment.y2x + seen_indices = set() output = [] - pred_start = 0 - for sent in self.reference.sents: - new_ref = sent.as_doc() - pred_end = sent_starts.index(1, pred_start+1) # find where the next sentence starts - new_pred = self.predicted[pred_start : pred_end].as_doc() - output.append(Example(new_pred, new_ref)) - pred_start = pred_end - + for y_sent in self.reference.sents: + indices = align[y_sent.start : y_sent.end].data.ravel() + indices = [idx for idx in indices if idx not in seen_indices] + if indices: + x_sent = self.predicted[indices[0] : indices[-1] + 1] + output.append(Example(x_sent.as_doc(), y_sent.as_doc())) + seen_indices.update(indices) return output property text: @@ -258,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot): values.append([vocab.morphology.add(v) for v in value]) else: attrs.append(key) - values.append([vocab.strings.add(v) for v in value]) + try: + values.append([vocab.strings.add(v) for v in value]) + except TypeError: + types= set([type(v) for v in value]) + raise TypeError(Errors.E969.format(field=key, types=types)) array = numpy.asarray(values, dtype="uint64") return attrs, array.T diff --git a/spacy/language.py b/spacy/language.py index da45c058c..a95b6d279 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -540,19 +540,15 @@ class Language(object): if component_cfg is None: component_cfg = {} - component_deps = count_pipeline_interdependencies(self.pipeline) - # Determine whether component should set annotations. In theory I guess - # we should do this by inspecting the meta? 
Or we could just always - # say "yes" for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) component_cfg[name].setdefault("drop", drop) - component_cfg[name]["set_annotations"] = bool(component_deps[i]) + component_cfg[name].setdefault("set_annotations", False) for name, proc in self.pipeline: if not hasattr(proc, "update"): continue proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) - if sgd is not False: + if sgd not in (None, False): for name, proc in self.pipeline: if hasattr(proc, "model"): proc.model.finish_update(sgd) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index 3b5f09e7b..a3e2633e9 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -1,13 +1,14 @@ from thinc.api import Model, normal_init -def PrecomputableAffine(nO, nI, nF, nP): +def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): model = Model( "precomputable_affine", forward, init=init, dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, params={"W": None, "b": None, "pad": None}, + attrs={"dropout_rate": dropout} ) return model @@ -48,17 +49,14 @@ def forward(model, X, is_train): model.inc_grad("b", dY.sum(axis=0)) dY = dY.reshape((dY.shape[0], nO * nP)) - Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3))) + Wopfi = W.transpose((1, 2, 0, 3)) Wopfi = Wopfi.reshape((nO * nP, nF * nI)) dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) - # Reuse the buffer - dWopfi = Wopfi - dWopfi.fill(0.0) - model.ops.gemm(dY, Xf, out=dWopfi, trans1=True) + dWopfi = model.ops.gemm(dY, Xf, trans1=True) dWopfi = dWopfi.reshape((nO, nP, nF, nI)) # (o, p, f, i) --> (f, o, p, i) - dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3))) + dWopfi = dWopfi.transpose((2, 0, 1, 3)) model.inc_grad("W", dWopfi) return dXf.reshape((dXf.shape[0], nF, nI)) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index d2b70c36e..f1a9c7d1f 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -263,20 +263,20 @@ def build_Tok2Vec_model( cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): norm = HashEmbed( - nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout, + nO=width, nV=embed_size, column=cols.index(NORM), dropout=None, seed=0 ) if subword_features: prefix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None, seed=1 ) suffix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None, seed=2 ) shape = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None, seed=3 ) else: @@ -296,7 +296,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -309,7 +309,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -322,7 +322,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -335,7 +335,7 @@ def build_Tok2Vec_model( reduce_dimensions = Maxout( nO=width, nI=nM * nC + width, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ) diff --git 
a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 88f27f0bf..39d4b0a14 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear from ..syntax._parser_model import ParserStepModel -def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): +def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()): """Set up a stepwise transition-based model""" if upper is None: has_upper = False diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 78e8e17c0..a3aa8be22 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -272,7 +272,7 @@ cdef class Morphology: @staticmethod def feats_to_dict(feats): - if not feats: + if not feats or feats == Morphology.EMPTY_MORPH: return {} return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index f792d57b0..57b778434 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -3,7 +3,7 @@ cimport numpy as np import numpy import srsly -from thinc.api import to_categorical +from thinc.api import SequenceCategoricalCrossentropy from ..tokens.doc cimport Doc from ..vocab cimport Vocab @@ -85,13 +85,10 @@ class Morphologizer(Tagger): doc.is_morphed = True def get_loss(self, examples, scores): - scores = self.model.ops.flatten(scores) - tag_index = {tag: i for i, tag in enumerate(self.labels)} - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype="f") + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + truths = [] for eg in examples: + eg_truths = [] pos_tags = eg.get_aligned("POS", as_string=True) morphs = eg.get_aligned("MORPH", as_string=True) for i in range(len(morphs)): @@ -104,20 +101,11 @@ class Morphologizer(Tagger): morph = self.vocab.strings[self.vocab.morphology.add(feats)] if morph == "": morph = Morphology.EMPTY_MORPH - if morph is None: - correct[idx] = guesses[idx] - elif morph in tag_index: - correct[idx] = tag_index[morph] - else: - correct[idx] = 0 - known_labels[idx] = 0. 
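The feats_to_dict change above only adds the empty-morphology placeholder to the early-return path. As a rough illustration of the parsing it performs on a UD-style FEATS string (using the literal separators "|", "=" and "," in place of the class constants):

    feats = "Case=Nom|Number=Sing,Plur"
    parsed = {
        field: ",".join(sorted(values.split(",")))
        for field, values in (feat.split("=") for feat in feats.split("|"))
    }
    print(parsed)  # {'Case': 'Nom', 'Number': 'Plur,Sing'}

An empty string, and now also the empty-FEATS placeholder, short-circuits to {} instead.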
- idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [eg.predicted for eg in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + eg_truths.append(morph) + truths.append(eg_truths) + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def to_bytes(self, exclude=tuple()): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 61cf155a2..86c768e9b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -334,7 +334,7 @@ class Tagger(Pipe): losses[self.name] += (gradient**2).sum() def get_loss(self, examples, scores): - loss_func = SequenceCategoricalCrossentropy(names=self.labels) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) truths = [eg.get_aligned("tag", as_string=True) for eg in examples] d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): @@ -521,29 +521,23 @@ class SentenceRecognizer(Tagger): doc.c[j].sent_start = -1 def get_loss(self, examples, scores): - scores = self.model.ops.flatten(scores) - tag_index = range(len(self.labels)) - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype="f") + labels = self.labels + loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) + truths = [] for eg in examples: - sent_starts = eg.get_aligned("sent_start") - for sent_start in sent_starts: - if sent_start is None: - correct[idx] = guesses[idx] - elif sent_start in tag_index: - correct[idx] = sent_start + eg_truth = [] + for x in eg.get_aligned("sent_start"): + if x == None: + eg_truth.append(None) + elif x == 1: + eg_truth.append(labels[1]) else: - correct[idx] = 0 - known_labels[idx] = 0. 
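The common pattern in these get_loss() rewrites is to hand thinc's SequenceCategoricalCrossentropy one list of string labels per example and let it produce the gradient and loss in a single call. A minimal sketch with invented labels and scores (in the pipeline code the truths can also contain None for tokens without gold annotation):

    import numpy
    from thinc.api import SequenceCategoricalCrossentropy

    labels = ["A", "B"]
    loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
    # One "document" of three tokens with scores over the two labels.
    scores = [numpy.asarray([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]], dtype="f")]
    truths = [["A", "B", "A"]]
    d_scores, loss = loss_func(scores, truths)
    print(float(loss), d_scores[0].shape)  # scalar loss and a (3, 2) gradient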
- idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [eg.predicted for eg in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + # anything other than 1: 0, -1, -1 as uint64 + eg_truth.append(labels[0]) + truths.append(eg_truth) + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, diff --git a/spacy/schemas.py b/spacy/schemas.py index 38e08b4cb..b7307b5b2 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -222,7 +222,7 @@ class TrainingSchema(BaseModel): class ProjectConfigAsset(BaseModel): # fmt: off dest: StrictStr = Field(..., title="Destination of downloaded asset") - url: StrictStr = Field(..., title="URL of asset") + url: Optional[StrictStr] = Field(None, title="URL of asset") checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") # fmt: on @@ -246,7 +246,7 @@ class ProjectConfigSchema(BaseModel): # fmt: off variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands") assets: List[ProjectConfigAsset] = Field([], title="Data assets") - run: List[StrictStr] = Field([], title="Names of project commands to execute, in order") + workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") # fmt: on diff --git a/spacy/scorer.py b/spacy/scorer.py index 87033d234..6fc86e412 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -326,10 +326,11 @@ class Scorer(object): for token in doc: if token.orth_.isspace(): continue - gold_i = align.cand_to_gold[token.i] - if gold_i is None: + if align.x2y.lengths[token.i] != 1: self.tokens.fp += 1 + gold_i = None else: + gold_i = align.x2y[token.i].dataXd[0, 0] self.tokens.tp += 1 cand_tags.add((gold_i, token.tag_)) cand_pos.add((gold_i, token.pos_)) @@ -345,7 +346,10 @@ class Scorer(object): if token.is_sent_start: cand_sent_starts.add(gold_i) if token.dep_.lower() not in punct_labels and token.orth_.strip(): - gold_head = align.cand_to_gold[token.head.i] + if align.x2y.lengths[token.head.i] == 1: + gold_head = align.x2y[token.head.i].dataXd[0, 0] + else: + gold_head = None # None is indistinct, so we can't just add it to the set # Multiple (None, None) deps are possible if gold_i is None or gold_head is None: @@ -381,15 +385,9 @@ class Scorer(object): gold_ents.add(gold_ent) gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1)) cand_per_ents = {ent_label: set() for ent_label in ent_labels} - for ent in doc.ents: - first = align.cand_to_gold[ent.start] - last = align.cand_to_gold[ent.end - 1] - if first is None or last is None: - self.ner.fp += 1 - self.ner_per_ents[ent.label_].fp += 1 - else: - cand_ents.add((ent.label_, first, last)) - cand_per_ents[ent.label_].add((ent.label_, first, last)) + for ent in example.get_aligned_spans_x2y(doc.ents): + cand_ents.add((ent.label_, ent.start, ent.end - 1)) + cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1)) # Scores per ent for k, v in self.ner_per_ents.items(): if k in cand_per_ents: diff --git a/spacy/syntax/_parser_model.pyx 
b/spacy/syntax/_parser_model.pyx index 853facdc6..42baa737b 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no class ParserStepModel(Model): - def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True): + def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True, + dropout=0.1): Model.__init__(self, name="parser_step_model", forward=step_forward) self.attrs["has_upper"] = has_upper + self.attrs["dropout_rate"] = dropout self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train) if layers[1].get_dim("nP") >= 2: activation = "maxout" @@ -289,11 +291,17 @@ class ParserStepModel(Model): self.bp_tokvecs(d_tokvecs[:-1]) return d_tokvecs +NUMPY_OPS = NumpyOps() def step_forward(model: ParserStepModel, states, is_train): token_ids = model.get_token_ids(states) vector, get_d_tokvecs = model.state2vec(token_ids, is_train) + mask = None if model.attrs["has_upper"]: + dropout_rate = model.attrs["dropout_rate"] + if is_train and dropout_rate > 0: + mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1) + vector *= mask scores, get_d_vector = model.vec2scores(vector, is_train) else: scores = NumpyOps().asarray(vector) @@ -305,6 +313,8 @@ def step_forward(model: ParserStepModel, states, is_train): # Zero vectors for unseen classes d_scores *= model._class_mask d_vector = get_d_vector(d_scores) + if mask is not None: + d_vector *= mask if isinstance(model.state2vec.ops, CupyOps) \ and not isinstance(token_ids, model.state2vec.ops.xp.ndarray): # Move token_ids and d_vector to GPU, asynchronously @@ -437,7 +447,7 @@ cdef class precompute_hiddens: sum_state_features(state_vector.data, feat_weights, &ids[0,0], token_ids.shape[0], self.nF, self.nO*self.nP) - state_vector = state_vector + self.bias + state_vector += self.bias state_vector, bp_nonlinearity = self._nonlinearity(state_vector) def backward(d_state_vector_ids): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0295241c6..8bac8cd89 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -65,7 +65,6 @@ cdef class Parser: self.set_output(self.moves.n_moves) self.cfg = dict(cfg) self.cfg.setdefault("update_with_oracle_cut_size", 100) - self.cfg.setdefault("normalize_gradients_with_batch_size", True) self._multitasks = [] for multitask in cfg.get("multitasks", []): self.add_multitask_objective(multitask) @@ -280,11 +279,12 @@ cdef class Parser: [eg.predicted for eg in examples]) if self.cfg["update_with_oracle_cut_size"] >= 1: # Chop sequences into lengths of this many transitions, to make the - # batch uniform length. We randomize this to overfit less. + # batch uniform length. + # We used to randomize this, but it's not clear that actually helps? 
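The new dropout in step_forward() follows the usual inverted-dropout recipe: draw one mask, scale the activations with it on the forward pass, and reuse the same mask on the gradient. A standalone sketch of that idea using thinc's ops (array sizes are arbitrary):

    import numpy
    from thinc.api import NumpyOps

    ops = NumpyOps()
    vector = numpy.ones((4, 6), dtype="f")
    # The mask is pre-scaled by 1 / (1 - rate), so surviving units are boosted
    # to keep the expected activation magnitude unchanged.
    mask = ops.get_dropout_mask(vector.shape, 0.1)
    vector *= mask
    d_vector = numpy.ones_like(vector)
    d_vector *= mask  # the same units are zeroed on the backward pass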
cut_size = self.cfg["update_with_oracle_cut_size"] states, golds, max_steps = self._init_gold_batch( examples, - max_length=numpy.random.choice(range(5, cut_size)) + max_length=cut_size ) else: states, golds, _ = self.moves.init_gold_batch(examples) @@ -292,24 +292,15 @@ cdef class Parser: if not states: return losses all_states = list(states) - states_golds = zip(states, golds) - for _ in range(max_steps): - if not states_golds: - break + states_golds = list(zip(states, golds)) + while states_golds: states, golds = zip(*states_golds) scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, losses) - if self.cfg["normalize_gradients_with_batch_size"]: - # We have to be very careful how we do this, because of the way we - # cut up the batch. We subdivide long sequences. If we normalize - # naively, we end up normalizing by sequence length, which - # is bad: that would mean that states in long sequences - # consistently get smaller gradients. Imagine if we have two - # sequences, one length 1000, one length 20. If we cut up - # the 1k sequence so that we have a "batch" of 50 subsequences, - # we don't want the gradients to get 50 times smaller! - d_scores /= n_examples - + # Note that the gradient isn't normalized by the batch size + # here, because our "samples" are really the states...But we + # can't normalize by the number of states either, as then we'd + # be getting smaller gradients for states in long sequences. backprop(d_scores) # Follow the predicted action self.transition_states(states, scores) @@ -407,6 +398,7 @@ cdef class Parser: cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]) c_d_scores += d_scores.shape[1] + # Note that we don't normalize this. See comment in update() for why. if losses is not None: losses.setdefault(self.name, 0.) 
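As a rough back-of-the-envelope for the normalization comment above (all numbers invented): if a 1000-token sequence is chopped into 50 subsequences, naively normalizing by that subsequence count would make its per-state gradients about 50 times smaller than if the sequence had been processed whole, which is why the summed gradient is left unnormalized here.

    # Toy arithmetic only, not part of the parser code.
    subsequences = 50
    grad_per_state_unnormalized = 1.0
    grad_per_state_if_normalized = grad_per_state_unnormalized / subsequences
    print(grad_per_state_if_normalized)  # 0.02, i.e. 50x smaller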
losses[self.name] += (d_scores**2).sum() @@ -525,21 +517,25 @@ cdef class Parser: StateClass state Transition action all_states = self.moves.init_batch([eg.predicted for eg in examples]) + states = [] + golds = [] kept = [] max_length_seen = 0 for state, eg in zip(all_states, examples): if self.moves.has_gold(eg) and not state.is_final(): gold = self.moves.init_gold(state, eg) - oracle_actions = self.moves.get_oracle_sequence_from_state( - state.copy(), gold) - kept.append((eg, state, gold, oracle_actions)) - min_length = min(min_length, len(oracle_actions)) - max_length_seen = max(max_length, len(oracle_actions)) + if len(eg.x) < max_length: + states.append(state) + golds.append(gold) + else: + oracle_actions = self.moves.get_oracle_sequence_from_state( + state.copy(), gold) + kept.append((eg, state, gold, oracle_actions)) + min_length = min(min_length, len(oracle_actions)) + max_length_seen = max(max_length, len(oracle_actions)) if not kept: - return [], [], 0 + return states, golds, 0 max_length = max(min_length, min(max_length, max_length_seen)) - states = [] - golds = [] cdef int clas max_moves = 0 for eg, state, gold, oracle_actions in kept: diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 86d9a0180..496ec7e03 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree): def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree): assert contains_cycle(tree) is None - assert contains_cycle(cyclic_tree) == set([3, 4, 5]) + assert contains_cycle(cyclic_tree) == {3, 4, 5} assert contains_cycle(partial_tree) is None assert contains_cycle(multirooted_tree) is None diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index bfa1bd65a..82f536076 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -38,6 +38,11 @@ def test_overfitting_IO(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # add some cases where SENT_START == -1 + train_examples[0].reference[10].is_sent_start = False + train_examples[1].reference[1].is_sent_start = False + train_examples[1].reference[11].is_sent_start = False + nlp.add_pipe(senter) optimizer = nlp.begin_training() diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 67966f70e..8b998d216 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -23,6 +23,7 @@ def test_issue2070(): assert len(doc) == 11 +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -134,6 +135,7 @@ def test_issue2464(en_vocab): assert len(matches) == 3 +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 5d504a9c6..768ae33fe 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls): assert doc[0].like_num +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2800(): """Test issue that arises when too 
many labels are added to NER model. Used to cause segfault. """ nlp = English() train_data = [] - train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]) + train_data.extend( + [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] + ) entity_types = [str(i) for i in range(1000)] ner = nlp.create_pipe("ner") nlp.add_pipe(ner) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 1aceba68f..1d5bfcb92 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -88,6 +88,7 @@ def test_issue3199(): assert list(doc[0:3].noun_chunks) == [] +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue3209(): """Test issue that occurred in spaCy nightly where NER labels were being mapped to classes incorrectly after loading the model, when the labels diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py new file mode 100644 index 000000000..5e2ee902c --- /dev/null +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -0,0 +1,472 @@ +import pytest +from spacy.language import Language +from spacy.vocab import Vocab +from spacy.pipeline import EntityRuler, DependencyParser +from spacy.pipeline.defaults import default_parser +from spacy import displacy, load +from spacy.displacy import parse_deps +from spacy.tokens import Doc, Token +from spacy.matcher import Matcher, PhraseMatcher +from spacy.errors import MatchPatternError +from spacy.util import minibatch +from spacy.gold import Example +from spacy.lang.hi import Hindi +from spacy.lang.es import Spanish +from spacy.lang.en import English +from spacy.attrs import IS_ALPHA +from thinc.api import compounding +import spacy +import srsly +import numpy + +from ..util import make_tempdir, get_doc + + +@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +def test_issue3521(en_tokenizer, word): + tok = en_tokenizer(word)[1] + # 'not' and 'would' should be stopwords, also in their abbreviated forms + assert tok.is_stop + + +def test_issue_3526_1(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + ruler_bytes = ruler.to_bytes() + assert len(ruler) == len(patterns) + assert len(ruler.labels) == 4 + assert ruler.overwrite + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(ruler_bytes) + assert len(new_ruler) == len(ruler) + assert len(new_ruler.labels) == 4 + assert new_ruler.overwrite == ruler.overwrite + assert new_ruler.ent_id_sep == ruler.ent_id_sep + + +def test_issue_3526_2(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + bytes_old_style = srsly.msgpack_dumps(ruler.patterns) + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(bytes_old_style) + assert len(new_ruler) 
== len(ruler) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert new_ruler.overwrite is not ruler.overwrite + + +def test_issue_3526_3(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + with make_tempdir() as tmpdir: + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert len(new_ruler) == len(ruler) + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue_3526_4(en_vocab): + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, overwrite_ents=True) + ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) + nlp.add_pipe(ruler) + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True + nlp2 = load(tmpdir) + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True + + +def test_issue3531(): + """Test that displaCy renderer doesn't require "settings" key.""" + example_dep = { + "words": [ + {"text": "But", "tag": "CCONJ"}, + {"text": "Google", "tag": "PROPN"}, + {"text": "is", "tag": "VERB"}, + {"text": "starting", "tag": "VERB"}, + {"text": "from", "tag": "ADP"}, + {"text": "behind.", "tag": "ADV"}, + ], + "arcs": [ + {"start": 0, "end": 3, "label": "cc", "dir": "left"}, + {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "aux", "dir": "left"}, + {"start": 3, "end": 4, "label": "prep", "dir": "right"}, + {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, + ], + } + example_ent = { + "text": "But Google is starting from behind.", + "ents": [{"start": 4, "end": 10, "label": "ORG"}], + } + dep_html = displacy.render(example_dep, style="dep", manual=True) + assert dep_html + ent_html = displacy.render(example_ent, style="ent", manual=True) + assert ent_html + + +def test_issue3540(en_vocab): + words = ["I", "live", "in", "NewYork", "right", "now"] + tensor = numpy.asarray( + [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], + dtype="f", + ) + doc = Doc(en_vocab, words=words) + doc.tensor = tensor + gold_text = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_1 = [token.vector for token in doc] + assert len(vectors_1) == len(doc) + + with doc.retokenize() as retokenizer: + heads = [(doc[3], 1), doc[2]] + attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} + retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) + + gold_text = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + 
vectors_2 = [token.vector for token in doc] + assert len(vectors_2) == len(doc) + assert vectors_1[0].tolist() == vectors_2[0].tolist() + assert vectors_1[1].tolist() == vectors_2[1].tolist() + assert vectors_1[2].tolist() == vectors_2[2].tolist() + assert vectors_1[4].tolist() == vectors_2[5].tolist() + assert vectors_1[5].tolist() == vectors_2[6].tolist() + + +def test_issue3549(en_vocab): + """Test that match pattern validation doesn't raise on empty errors.""" + matcher = Matcher(en_vocab, validate=True) + pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] + matcher.add("GOOD", [pattern]) + with pytest.raises(MatchPatternError): + matcher.add("BAD", [[{"X": "Y"}]]) + + +@pytest.mark.xfail +def test_issue3555(en_vocab): + """Test that custom extensions with default None don't break matcher.""" + Token.set_extension("issue3555", default=None) + matcher = Matcher(en_vocab) + pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["have", "apple"]) + matcher(doc) + + +def test_issue3611(): + """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + textcat = nlp.create_pipe( + "textcat", + config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, + ) + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.begin_training(X=x_train, Y=y_train) + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update( + examples=batch, sgd=optimizer, drop=0.1, losses=losses, + ) + + +def test_issue3625(): + """Test that default punctuation rules applies to hindi unicode characters""" + nlp = Hindi() + doc = nlp("hi. how हुए. 
होटल, होटल") + expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] + assert [token.text for token in doc] == expected + + +def test_issue3803(): + """Test that spanish num-like tokens have True for like_num attribute.""" + nlp = Spanish() + text = "2 dos 1000 mil 12 doce" + doc = nlp(text) + + assert [t.like_num for t in doc] == [True, True, True, True, True, True] + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3830_no_subtok(): + """Test that the parser doesn't have subtok label if not learn_tokens""" + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + parser = DependencyParser(Vocab(), default_parser(), **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.begin_training(lambda: []) + assert "subtok" not in parser.labels + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3830_with_subtok(): + """Test that the parser does have subtok label if learn_tokens=True.""" + config = { + "learn_tokens": True, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + parser = DependencyParser(Vocab(), default_parser(), **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.begin_training(lambda: []) + assert "subtok" in parser.labels + + +def test_issue3839(en_vocab): + """Test that match IDs returned by the matcher are correct, are in the string """ + doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) + matcher = Matcher(en_vocab) + match_id = "PATTERN" + pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] + pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] + matcher.add(match_id, [pattern1]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + matcher = Matcher(en_vocab) + matcher.add(match_id, [pattern2]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + + +@pytest.mark.parametrize( + "sentence", + [ + "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", + "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", + "It was a missed assignment, but it shouldn't have resulted in a turnover ...", + ], +) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + doc = nlp(sentence) + count = 0 + for token in doc: + count += token.is_alpha + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + +def test_issue3879(en_vocab): + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + assert len(doc) == 5 + pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] + matcher = Matcher(en_vocab) + matcher.add("TEST", [pattern]) + assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3880(): + """Test that `nlp.pipe()` works when an empty string ends the batch. + + Fixed in v7.0.5 of Thinc. 
+ """ + texts = ["hello", "world", "", ""] + nlp = English() + nlp.add_pipe(nlp.create_pipe("parser")) + nlp.add_pipe(nlp.create_pipe("ner")) + nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.get_pipe("parser").add_label("dep") + nlp.get_pipe("ner").add_label("PERSON") + nlp.get_pipe("tagger").add_label("NN") + nlp.begin_training() + for doc in nlp.pipe(texts): + pass + + +def test_issue3882(en_vocab): + """Test that displaCy doesn't serialize the doc.user_data when making a + copy of the Doc. + """ + doc = Doc(en_vocab, words=["Hello", "world"]) + doc.is_parsed = True + doc.user_data["test"] = set() + parse_deps(doc) + + +def test_issue3951(en_vocab): + """Test that combinations of optional rules are matched correctly.""" + matcher = Matcher(en_vocab) + pattern = [ + {"LOWER": "hello"}, + {"LOWER": "this", "OP": "?"}, + {"OP": "?"}, + {"LOWER": "world"}, + ] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) + matches = matcher(doc) + assert len(matches) == 0 + + +def test_issue3959(): + """ Ensure that a modified pos attribute is serialized correctly.""" + nlp = English() + doc = nlp( + "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" + ) + assert doc[0].pos_ == "" + doc[0].pos_ = "NOUN" + assert doc[0].pos_ == "NOUN" + # usually this is already True when starting from proper models instead of blank English + doc.is_tagged = True + with make_tempdir() as tmp_dir: + file_path = tmp_dir / "my_doc" + doc.to_disk(file_path) + doc2 = nlp("") + doc2.from_disk(file_path) + assert doc2[0].pos_ == "NOUN" + + +def test_issue3962(en_vocab): + """ Ensure that as_doc does not result in out-of-bound access of tokens. + This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] + heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] + deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = doc[1:5] # "jests at scars ," + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "dep" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" # head set to the new artificial root + assert doc2[3].dep_ == "dep" + # We should still have 1 sentence + assert len(list(doc2.sents)) == 1 + span3 = doc[6:9] # "never felt a" + doc3 = span3.as_doc() + doc3_json = doc3.to_json() + assert doc3_json + assert doc3[0].head.text == "felt" + assert doc3[0].dep_ == "neg" + assert doc3[1].head.text == "felt" + assert doc3[1].dep_ == "ROOT" + assert doc3[2].head.text == "felt" # head set to ancestor + assert doc3[2].dep_ == "dep" + # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" + assert len(list(doc3.sents)) == 1 + + +def test_issue3962_long(en_vocab): + """ Ensure that as_doc does not result in out-of-bound access of tokens. 
+ This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] + heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] + deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = two_sent_doc[1:7] # "jests at scars. They never" + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root (in sentence 1) + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "ROOT" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" + assert doc2[3].dep_ == "punct" + # head set to itself, being the new artificial root (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # head set to the new artificial head (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # We should still have 2 sentences + sents = list(doc2.sents) + assert len(sents) == 2 + assert sents[0].text == "jests at scars ." + assert sents[1].text == "They never" + + +def test_issue3972(en_vocab): + """Test that the PhraseMatcher returns duplicates for duplicate match IDs. + """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) + matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) + doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) + matches = matcher(doc) + + assert len(matches) == 2 + + # We should have a match for each of the two rules + found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] + assert "A" in found_ids + assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py deleted file mode 100644 index 3d8ee9922..000000000 --- a/spacy/tests/regression/test_issue3521.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - - -@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) -def test_issue3521(en_tokenizer, word): - tok = en_tokenizer(word)[1] - # 'not' and 'would' should be stopwords, also in their abbreviated forms - assert tok.is_stop diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py deleted file mode 100644 index aa77028fb..000000000 --- a/spacy/tests/regression/test_issue3526.py +++ /dev/null @@ -1,85 +0,0 @@ -import pytest -from spacy.tokens import Span -from spacy.language import Language -from spacy.pipeline import EntityRuler -from spacy import load -import srsly - -from ..util import make_tempdir - - -@pytest.fixture -def patterns(): - return [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - - -@pytest.fixture -def add_ent(): - def add_ent_component(doc): - doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])] - return doc - - return add_ent_component - - -def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - ruler_bytes = ruler.to_bytes() - 
assert len(ruler) == len(patterns) - assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(ruler_bytes) - assert len(new_ruler) == len(ruler) - assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite - - -def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, overwrite_ents=True) - - ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) - nlp.add_pipe(ruler) - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - ruler = nlp.get_pipe("entity_ruler") - assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True - nlp2 = load(tmpdir) - new_ruler = nlp2.get_pipe("entity_ruler") - assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert new_ruler.overwrite is True diff --git a/spacy/tests/regression/test_issue3531.py b/spacy/tests/regression/test_issue3531.py deleted file mode 100644 index 4c65a5bfe..000000000 --- a/spacy/tests/regression/test_issue3531.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy import displacy - - -def test_issue3531(): - """Test that displaCy renderer doesn't require "settings" key.""" - example_dep = { - "words": [ - {"text": "But", "tag": "CCONJ"}, - {"text": "Google", "tag": "PROPN"}, - {"text": "is", "tag": "VERB"}, - {"text": "starting", "tag": "VERB"}, - {"text": "from", "tag": "ADP"}, - {"text": "behind.", "tag": "ADV"}, - ], - "arcs": [ - {"start": 0, "end": 3, "label": "cc", "dir": "left"}, - {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, - {"start": 2, "end": 3, "label": "aux", "dir": "left"}, - {"start": 3, "end": 4, "label": "prep", "dir": "right"}, - {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, - ], - } - example_ent = { - "text": "But Google is starting from behind.", - "ents": [{"start": 4, "end": 10, "label": "ORG"}], - } - dep_html = displacy.render(example_dep, style="dep", manual=True) - assert dep_html - ent_html = displacy.render(example_ent, style="ent", manual=True) - assert ent_html diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py deleted file mode 100644 index be9e04b0b..000000000 --- a/spacy/tests/regression/test_issue3540.py +++ /dev/null @@ -1,44 +0,0 @@ -from spacy.tokens import Doc - -import numpy as np - - -def test_issue3540(en_vocab): - - words = ["I", "live", "in", "NewYork", "right", "now"] - tensor = 
np.asarray( - [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], - dtype="f", - ) - doc = Doc(en_vocab, words=words) - doc.tensor = tensor - - gold_text = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.text for token in doc] == gold_text - - gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - - vectors_1 = [token.vector for token in doc] - assert len(vectors_1) == len(doc) - - with doc.retokenize() as retokenizer: - heads = [(doc[3], 1), doc[2]] - attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} - retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) - - gold_text = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.text for token in doc] == gold_text - - gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - - vectors_2 = [token.vector for token in doc] - assert len(vectors_2) == len(doc) - - assert vectors_1[0].tolist() == vectors_2[0].tolist() - assert vectors_1[1].tolist() == vectors_2[1].tolist() - assert vectors_1[2].tolist() == vectors_2[2].tolist() - - assert vectors_1[4].tolist() == vectors_2[5].tolist() - assert vectors_1[5].tolist() == vectors_2[6].tolist() diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py deleted file mode 100644 index b3af59c2e..000000000 --- a/spacy/tests/regression/test_issue3549.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest -from spacy.matcher import Matcher -from spacy.errors import MatchPatternError - - -def test_issue3549(en_vocab): - """Test that match pattern validation doesn't raise on empty errors.""" - matcher = Matcher(en_vocab, validate=True) - pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] - matcher.add("GOOD", [pattern]) - with pytest.raises(MatchPatternError): - matcher.add("BAD", [[{"X": "Y"}]]) diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py deleted file mode 100644 index de047bcbc..000000000 --- a/spacy/tests/regression/test_issue3555.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest -from spacy.tokens import Doc, Token -from spacy.matcher import Matcher - - -@pytest.mark.xfail -def test_issue3555(en_vocab): - """Test that custom extensions with default None don't break matcher.""" - Token.set_extension("issue3555", default=None) - matcher = Matcher(en_vocab) - pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["have", "apple"]) - matcher(doc) diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py deleted file mode 100644 index ef189c446..000000000 --- a/spacy/tests/regression/test_issue3611.py +++ /dev/null @@ -1,45 +0,0 @@ -import spacy -from spacy.util import minibatch -from thinc.api import compounding -from spacy.gold import Example - - -def test_issue3611(): - """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - - nlp = spacy.blank("en") - - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": 
cat_dict})) - - # add a text categorizer component - textcat = nlp.create_pipe( - "textcat", - config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, - ) - - for label in unique_classes: - textcat.add_label(label) - nlp.add_pipe(textcat, last=True) - - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training(X=x_train, Y=y_train) - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py deleted file mode 100644 index 51561b3ac..000000000 --- a/spacy/tests/regression/test_issue3625.py +++ /dev/null @@ -1,9 +0,0 @@ -from spacy.lang.hi import Hindi - - -def test_issue3625(): - """Test that default punctuation rules applies to hindi unicode characters""" - nlp = Hindi() - doc = nlp("hi. how हुए. होटल, होटल") - expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] - assert [token.text for token in doc] == expected diff --git a/spacy/tests/regression/test_issue3803.py b/spacy/tests/regression/test_issue3803.py deleted file mode 100644 index ab5250edf..000000000 --- a/spacy/tests/regression/test_issue3803.py +++ /dev/null @@ -1,10 +0,0 @@ -from spacy.lang.es import Spanish - - -def test_issue3803(): - """Test that spanish num-like tokens have True for like_num attribute.""" - nlp = Spanish() - text = "2 dos 1000 mil 12 doce" - doc = nlp(text) - - assert [t.like_num for t in doc] == [True, True, True, True, True, True] diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py deleted file mode 100644 index 06b7893a7..000000000 --- a/spacy/tests/regression/test_issue3830.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.pipeline.pipes import DependencyParser -from spacy.vocab import Vocab - -from spacy.pipeline.defaults import default_parser - - -def test_issue3830_no_subtok(): - """Test that the parser doesn't have subtok label if not learn_tokens""" - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - parser = DependencyParser(Vocab(), default_parser(), **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.begin_training(lambda: []) - assert "subtok" not in parser.labels - - -def test_issue3830_with_subtok(): - """Test that the parser does have subtok label if learn_tokens=True.""" - config = { - "learn_tokens": True, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - parser = DependencyParser(Vocab(), default_parser(), **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.begin_training(lambda: []) - assert "subtok" in parser.labels diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py deleted file mode 100644 index 27b1f5f29..000000000 --- a/spacy/tests/regression/test_issue3839.py +++ /dev/null @@ -1,18 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string """ - doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) - matcher = Matcher(en_vocab) - match_id = "PATTERN" - pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] - pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": 
"group"}] - matcher.add(match_id, [pattern1]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - matcher = Matcher(en_vocab) - matcher.add(match_id, [pattern2]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py deleted file mode 100644 index 0a851e869..000000000 --- a/spacy/tests/regression/test_issue3869.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -from spacy.attrs import IS_ALPHA -from spacy.lang.en import English - - -@pytest.mark.parametrize( - "sentence", - [ - "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", - "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", - "It was a missed assignment, but it shouldn't have resulted in a turnover ...", - ], -) -def test_issue3869(sentence): - """Test that the Doc's count_by function works consistently""" - nlp = English() - doc = nlp(sentence) - - count = 0 - for token in doc: - count += token.is_alpha - - assert count == doc.count_by(IS_ALPHA).get(1, 0) diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py deleted file mode 100644 index 8500c09aa..000000000 --- a/spacy/tests/regression/test_issue3879.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3879(en_vocab): - doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) - assert len(doc) == 5 - pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] - matcher = Matcher(en_vocab) - matcher.add("TEST", [pattern]) - assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py deleted file mode 100644 index 6e8ab6f43..000000000 --- a/spacy/tests/regression/test_issue3880.py +++ /dev/null @@ -1,21 +0,0 @@ -from spacy.lang.en import English -import pytest - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_issue3880(): - """Test that `nlp.pipe()` works when an empty string ends the batch. - - Fixed in v7.0.5 of Thinc. - """ - texts = ["hello", "world", "", ""] - nlp = English() - nlp.add_pipe(nlp.create_pipe("parser")) - nlp.add_pipe(nlp.create_pipe("ner")) - nlp.add_pipe(nlp.create_pipe("tagger")) - nlp.get_pipe("parser").add_label("dep") - nlp.get_pipe("ner").add_label("PERSON") - nlp.get_pipe("tagger").add_label("NN") - nlp.begin_training() - for doc in nlp.pipe(texts): - pass diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py deleted file mode 100644 index fa616db1d..000000000 --- a/spacy/tests/regression/test_issue3882.py +++ /dev/null @@ -1,12 +0,0 @@ -from spacy.displacy import parse_deps -from spacy.tokens import Doc - - -def test_issue3882(en_vocab): - """Test that displaCy doesn't serialize the doc.user_data when making a - copy of the Doc. 
- """ - doc = Doc(en_vocab, words=["Hello", "world"]) - doc.is_parsed = True - doc.user_data["test"] = set() - parse_deps(doc) diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py deleted file mode 100644 index 6e4c9eeaa..000000000 --- a/spacy/tests/regression/test_issue3951.py +++ /dev/null @@ -1,17 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3951(en_vocab): - """Test that combinations of optional rules are matched correctly.""" - matcher = Matcher(en_vocab) - pattern = [ - {"LOWER": "hello"}, - {"LOWER": "this", "OP": "?"}, - {"OP": "?"}, - {"LOWER": "world"}, - ] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) - matches = matcher(doc) - assert len(matches) == 0 diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py deleted file mode 100644 index 7db28a31f..000000000 --- a/spacy/tests/regression/test_issue3959.py +++ /dev/null @@ -1,26 +0,0 @@ -from spacy.lang.en import English -from ..util import make_tempdir - - -def test_issue3959(): - """ Ensure that a modified pos attribute is serialized correctly.""" - nlp = English() - doc = nlp( - "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" - ) - assert doc[0].pos_ == "" - - doc[0].pos_ = "NOUN" - assert doc[0].pos_ == "NOUN" - - # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True - - with make_tempdir() as tmp_dir: - file_path = tmp_dir / "my_doc" - doc.to_disk(file_path) - - doc2 = nlp("") - doc2.from_disk(file_path) - - assert doc2[0].pos_ == "NOUN" diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py deleted file mode 100644 index 971c9b08e..000000000 --- a/spacy/tests/regression/test_issue3962.py +++ /dev/null @@ -1,117 +0,0 @@ -import pytest - -from ..util import get_doc - - -@pytest.fixture -def doc(en_tokenizer): - text = "He jests at scars, that never felt a wound." - heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] - deps = [ - "nsubj", - "ccomp", - "prep", - "pobj", - "punct", - "nsubj", - "neg", - "ROOT", - "det", - "dobj", - "punct", - ] - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - - -def test_issue3962(doc): - """ Ensure that as_doc does not result in out-of-bound access of tokens. 
- This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - span2 = doc[1:5] # "jests at scars ," - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - - assert ( - doc2[0].head.text == "jests" - ) # head set to itself, being the new artificial root - assert doc2[0].dep_ == "dep" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" # head set to the new artificial root - assert doc2[3].dep_ == "dep" - - # We should still have 1 sentence - assert len(list(doc2.sents)) == 1 - - span3 = doc[6:9] # "never felt a" - doc3 = span3.as_doc() - doc3_json = doc3.to_json() - assert doc3_json - - assert doc3[0].head.text == "felt" - assert doc3[0].dep_ == "neg" - assert doc3[1].head.text == "felt" - assert doc3[1].dep_ == "ROOT" - assert doc3[2].head.text == "felt" # head set to ancestor - assert doc3[2].dep_ == "dep" - - # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" - assert len(list(doc3.sents)) == 1 - - -@pytest.fixture -def two_sent_doc(en_tokenizer): - text = "He jests at scars. They never felt a wound." - heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] - deps = [ - "nsubj", - "ROOT", - "prep", - "pobj", - "punct", - "nsubj", - "neg", - "ROOT", - "det", - "dobj", - "punct", - ] - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - - -def test_issue3962_long(two_sent_doc): - """ Ensure that as_doc does not result in out-of-bound access of tokens. - This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - span2 = two_sent_doc[1:7] # "jests at scars. They never" - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - - assert ( - doc2[0].head.text == "jests" - ) # head set to itself, being the new artificial root (in sentence 1) - assert doc2[0].dep_ == "ROOT" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" - assert doc2[3].dep_ == "punct" - assert ( - doc2[4].head.text == "They" - ) # head set to itself, being the new artificial root (in sentence 2) - assert doc2[4].dep_ == "dep" - assert ( - doc2[4].head.text == "They" - ) # head set to the new artificial head (in sentence 2) - assert doc2[4].dep_ == "dep" - - # We should still have 2 sentences - sents = list(doc2.sents) - assert len(sents) == 2 - assert sents[0].text == "jests at scars ." - assert sents[1].text == "They never" diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py deleted file mode 100644 index fe5388950..000000000 --- a/spacy/tests/regression/test_issue3972.py +++ /dev/null @@ -1,19 +0,0 @@ -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -def test_issue3972(en_vocab): - """Test that the PhraseMatcher returns duplicates for duplicate match IDs. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) - matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) - doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) - matches = matcher(doc) - - assert len(matches) == 2 - - # We should have a match for each of the two rules - found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] - assert "A" in found_ids - assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py new file mode 100644 index 000000000..2981c6428 --- /dev/null +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -0,0 +1,469 @@ +import pytest +from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe +from spacy.pipeline.defaults import default_ner +from spacy.matcher import PhraseMatcher, Matcher +from spacy.tokens import Doc, Span, DocBin +from spacy.gold import Example, Corpus +from spacy.gold.converters import json2docs +from spacy.vocab import Vocab +from spacy.lang.en import English +from spacy.util import minibatch, ensure_path, load_model +from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex +from spacy.tokenizer import Tokenizer +from spacy.lang.el import Greek +from spacy.language import Language +import spacy +from thinc.api import compounding +from collections import defaultdict + +from ..util import make_tempdir + + +def test_issue4002(en_vocab): + """Test that the PhraseMatcher can match on overwritten NORM attributes. + """ + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern1 = Doc(en_vocab, words=["c", "d"]) + assert [t.norm_ for t in pattern1] == ["c", "d"] + matcher.add("TEST", [pattern1]) + doc = Doc(en_vocab, words=["a", "b", "c", "d"]) + assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] + matches = matcher(doc) + assert len(matches) == 1 + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern2 = Doc(en_vocab, words=["1", "2"]) + pattern2[0].norm_ = "c" + pattern2[1].norm_ = "d" + assert [t.norm_ for t in pattern2] == ["c", "d"] + matcher.add("TEST", [pattern2]) + matches = matcher(doc) + assert len(matches) == 1 + + +def test_issue4030(): + """ Test whether textcat works fine with empty doc """ + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + textcat = nlp.create_pipe( + "textcat", + config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, + ) + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.begin_training() + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update( + examples=batch, sgd=optimizer, drop=0.1, losses=losses, + ) + # processing of an empty doc should result in 0.0 for all categories + doc = nlp("") + assert doc.cats["offensive"] == 0.0 + assert doc.cats["inoffensive"] == 0.0 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4042(): + 
"""Test that serialization of an EntityRuler before NER works fine.""" + nlp = English() + + # add ner pipe + ner = nlp.create_pipe("ner") + ner.add_label("SOME_LABEL") + nlp.add_pipe(ner) + nlp.begin_training() + + # Add entity ruler + ruler = EntityRuler(nlp) + patterns = [ + {"label": "MY_ORG", "pattern": "Apple"}, + {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, + ] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler, before="ner") # works fine with "after" + doc1 = nlp("What do you think about Apple ?") + assert doc1.ents[0].label_ == "MY_ORG" + + with make_tempdir() as d: + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + + nlp2 = load_model(output_dir) + doc2 = nlp2("What do you think about Apple ?") + assert doc2.ents[0].label_ == "MY_ORG" + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4042_bug2(): + """ + Test that serialization of an NER works fine when new labels were added. + This is the second bug of two bugs underlying the issue 4042. + """ + nlp1 = English() + vocab = nlp1.vocab + + # add ner pipe + ner1 = nlp1.create_pipe("ner") + ner1.add_label("SOME_LABEL") + nlp1.add_pipe(ner1) + nlp1.begin_training() + + # add a new label to the doc + doc1 = nlp1("What do you think about Apple ?") + assert len(ner1.labels) == 1 + assert "SOME_LABEL" in ner1.labels + apple_ent = Span(doc1, 5, 6, label="MY_ORG") + doc1.ents = list(doc1.ents) + [apple_ent] + + # reapply the NER - at this point it should resize itself + ner1(doc1) + assert len(ner1.labels) == 2 + assert "SOME_LABEL" in ner1.labels + assert "MY_ORG" in ner1.labels + + with make_tempdir() as d: + # assert IO goes fine + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + ner1.to_disk(output_dir) + + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + ner2 = EntityRecognizer(vocab, default_ner(), **config) + ner2.from_disk(output_dir) + assert len(ner2.labels) == 2 + + +def test_issue4054(en_vocab): + """Test that a new blank model can be made with a vocab from file, + and that serialization does not drop the language at any point.""" + nlp1 = English() + vocab1 = nlp1.vocab + with make_tempdir() as d: + vocab_dir = ensure_path(d / "vocab") + if not vocab_dir.exists(): + vocab_dir.mkdir() + vocab1.to_disk(vocab_dir) + vocab2 = Vocab().from_disk(vocab_dir) + print("lang", vocab2.lang) + nlp2 = spacy.blank("en", vocab=vocab2) + nlp_dir = ensure_path(d / "nlp") + if not nlp_dir.exists(): + nlp_dir.mkdir() + nlp2.to_disk(nlp_dir) + nlp3 = load_model(nlp_dir) + assert nlp3.lang == "en" + + +def test_issue4120(en_vocab): + """Test that matches without a final {OP: ?} token are returned.""" + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) + doc1 = Doc(en_vocab, words=["a"]) + assert len(matcher(doc1)) == 1 # works + doc2 = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc2)) == 2 # fixed + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) + doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc3)) == 2 # works + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) + doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc4)) == 3 # fixed + + +def test_issue4133(en_vocab): + nlp = English() + vocab_bytes = nlp.vocab.to_bytes() + words = ["Apple", "is", 
"looking", "at", "buying", "a", "startup"] + pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] + doc = Doc(en_vocab, words=words) + for i, token in enumerate(doc): + token.pos_ = pos[i] + # usually this is already True when starting from proper models instead of blank English + doc.is_tagged = True + doc_bytes = doc.to_bytes() + vocab = Vocab() + vocab = vocab.from_bytes(vocab_bytes) + doc = Doc(vocab).from_bytes(doc_bytes) + actual = [] + for token in doc: + actual.append(token.pos_) + assert actual == pos + + +def test_issue4190(): + def customize_tokenizer(nlp): + prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) + suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) + infix_re = compile_infix_regex(nlp.Defaults.infixes) + # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') + exceptions = { + k: v + for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() + if not (len(k) == 2 and k[1] == ".") + } + new_tokenizer = Tokenizer( + nlp.vocab, + exceptions, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=nlp.tokenizer.token_match, + ) + nlp.tokenizer = new_tokenizer + + test_string = "Test c." + # Load default language + nlp_1 = English() + doc_1a = nlp_1(test_string) + result_1a = [token.text for token in doc_1a] # noqa: F841 + # Modify tokenizer + customize_tokenizer(nlp_1) + doc_1b = nlp_1(test_string) + result_1b = [token.text for token in doc_1b] + # Save and Reload + with make_tempdir() as model_dir: + nlp_1.to_disk(model_dir) + nlp_2 = load_model(model_dir) + # This should be the modified tokenizer + doc_2 = nlp_2(test_string) + result_2 = [token.text for token in doc_2] + assert result_1b == result_2 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4267(): + """ Test that running an entity_ruler after ner gives consistent results""" + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("PEOPLE") + nlp.add_pipe(ner) + nlp.begin_training() + assert "ner" in nlp.pipe_names + # assert that we have correct IOB annotations + doc1 = nlp("hi") + assert doc1.is_nered + for token in doc1: + assert token.ent_iob == 2 + # add entity ruler and run again + ruler = EntityRuler(nlp) + patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + assert "entity_ruler" in nlp.pipe_names + assert "ner" in nlp.pipe_names + # assert that we still have correct IOB annotations + doc2 = nlp("hi") + assert doc2.is_nered + for token in doc2: + assert token.ent_iob == 2 + + +def test_issue4272(): + """Test that lookup table can be accessed from Token.lemma if no POS tags + are available.""" + nlp = Greek() + doc = nlp("Χθες") + assert doc[0].lemma_ + + +def test_multiple_predictions(): + class DummyPipe(Pipe): + def __init__(self): + self.model = "dummy_model" + + def predict(self, docs): + return ([1, 2, 3], [4, 5, 6]) + + def set_annotations(self, docs, scores, tensors=None): + return docs + + nlp = Language() + doc = nlp.make_doc("foo") + dummy_pipe = DummyPipe() + dummy_pipe(doc) + + +@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor") +def test_issue4313(): + """ This should not crash or exit with some strange error code """ + beam_width = 16 + beam_density = 0.0001 + nlp = English() + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + ner = EntityRecognizer(nlp.vocab, default_ner(), **config) + 
ner.add_label("SOME_LABEL") + ner.begin_training([]) + nlp.add_pipe(ner) + + # add a new label to the doc + doc = nlp("What do you think about Apple ?") + assert len(ner.labels) == 1 + assert "SOME_LABEL" in ner.labels + apple_ent = Span(doc, 5, 6, label="MY_ORG") + doc.ents = list(doc.ents) + [apple_ent] + + # ensure the beam_parse still works with the new label + docs = [doc] + beams = nlp.entity.beam_parse( + docs, beam_width=beam_width, beam_density=beam_density + ) + + for doc, beam in zip(docs, beams): + entity_scores = defaultdict(float) + for score, ents in nlp.entity.moves.get_beam_parses(beam): + for start, end, label in ents: + entity_scores[(start, end, label)] += score + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4348(): + """Test that training the tagger with empty data, doesn't throw errors""" + nlp = English() + example = Example.from_dict(nlp.make_doc(""), {"tags": []}) + TRAIN_DATA = [example, example] + tagger = nlp.create_pipe("tagger") + nlp.add_pipe(tagger) + optimizer = nlp.begin_training() + for i in range(5): + losses = {} + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + +def test_issue4367(): + """Test that docbin init goes well""" + DocBin() + DocBin(attrs=["LEMMA"]) + DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) + + +def test_issue4373(): + """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" + matcher = Matcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + matcher = PhraseMatcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + + +def test_issue4402(): + json_data = { + "id": 0, + "paragraphs": [ + { + "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "How", "ner": "O"}, + {"id": 1, "orth": "should", "ner": "O"}, + {"id": 2, "orth": "I", "ner": "O"}, + {"id": 3, "orth": "cook", "ner": "O"}, + {"id": 4, "orth": "bacon", "ner": "O"}, + {"id": 5, "orth": "in", "ner": "O"}, + {"id": 6, "orth": "an", "ner": "O"}, + {"id": 7, "orth": "oven", "ner": "O"}, + {"id": 8, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + { + "tokens": [ + {"id": 9, "orth": "\n", "ner": "O"}, + {"id": 10, "orth": "I", "ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 0.0}, + 
{"label": "not_baking", "value": 1.0}, + ], + }, + ], + } + nlp = English() + attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] + with make_tempdir() as tmpdir: + output_file = tmpdir / "test4402.spacy" + docs = json2docs([json_data]) + data = DocBin(docs=docs, attrs=attrs).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) + + train_data = list(corpus.train_dataset(nlp)) + assert len(train_data) == 2 + + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py deleted file mode 100644 index 3ac26d3ab..000000000 --- a/spacy/tests/regression/test_issue4002.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -def test_issue4002(en_vocab): - """Test that the PhraseMatcher can match on overwritten NORM attributes. - """ - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern1 = Doc(en_vocab, words=["c", "d"]) - assert [t.norm_ for t in pattern1] == ["c", "d"] - matcher.add("TEST", [pattern1]) - doc = Doc(en_vocab, words=["a", "b", "c", "d"]) - assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] - matches = matcher(doc) - assert len(matches) == 1 - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern2 = Doc(en_vocab, words=["1", "2"]) - pattern2[0].norm_ = "c" - pattern2[1].norm_ = "d" - assert [t.norm_ for t in pattern2] == ["c", "d"] - matcher.add("TEST", [pattern2]) - matches = matcher(doc) - assert len(matches) == 1 diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py deleted file mode 100644 index e40565501..000000000 --- a/spacy/tests/regression/test_issue4030.py +++ /dev/null @@ -1,50 +0,0 @@ -import spacy -from spacy.util import minibatch -from thinc.api import compounding -from spacy.gold import Example - - -def test_issue4030(): - """ Test whether textcat works fine with empty doc """ - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - - nlp = spacy.blank("en") - - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - - # add a text categorizer component - textcat = nlp.create_pipe( - "textcat", - config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, - ) - - for label in unique_classes: - textcat.add_label(label) - nlp.add_pipe(textcat, last=True) - - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) - - # processing of an empty doc should result in 0.0 for all categories - doc = nlp("") - assert doc.cats["offensive"] == 0.0 - assert doc.cats["inoffensive"] == 0.0 diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py deleted file mode 100644 index f47290b92..000000000 --- a/spacy/tests/regression/test_issue4042.py +++ /dev/null @@ -1,85 +0,0 @@ -import spacy -from 
spacy.pipeline import EntityRecognizer, EntityRuler -from spacy.lang.en import English -from spacy.tokens import Span -from spacy.util import ensure_path -from spacy.pipeline.defaults import default_ner - -from ..util import make_tempdir - - -def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" - nlp = English() - - # add ner pipe - ner = nlp.create_pipe("ner") - ner.add_label("SOME_LABEL") - nlp.add_pipe(ner) - nlp.begin_training() - - # Add entity ruler - ruler = EntityRuler(nlp) - patterns = [ - {"label": "MY_ORG", "pattern": "Apple"}, - {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, - ] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler, before="ner") # works fine with "after" - doc1 = nlp("What do you think about Apple ?") - assert doc1.ents[0].label_ == "MY_ORG" - - with make_tempdir() as d: - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - - nlp2 = spacy.load(output_dir) - doc2 = nlp2("What do you think about Apple ?") - assert doc2.ents[0].label_ == "MY_ORG" - - -def test_issue4042_bug2(): - """ - Test that serialization of an NER works fine when new labels were added. - This is the second bug of two bugs underlying the issue 4042. - """ - nlp1 = English() - vocab = nlp1.vocab - - # add ner pipe - ner1 = nlp1.create_pipe("ner") - ner1.add_label("SOME_LABEL") - nlp1.add_pipe(ner1) - nlp1.begin_training() - - # add a new label to the doc - doc1 = nlp1("What do you think about Apple ?") - assert len(ner1.labels) == 1 - assert "SOME_LABEL" in ner1.labels - apple_ent = Span(doc1, 5, 6, label="MY_ORG") - doc1.ents = list(doc1.ents) + [apple_ent] - - # reapply the NER - at this point it should resize itself - ner1(doc1) - assert len(ner1.labels) == 2 - assert "SOME_LABEL" in ner1.labels - assert "MY_ORG" in ner1.labels - - with make_tempdir() as d: - # assert IO goes fine - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - ner1.to_disk(output_dir) - - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - ner2 = EntityRecognizer(vocab, default_ner(), **config) - ner2.from_disk(output_dir) - assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py deleted file mode 100644 index c52ded395..000000000 --- a/spacy/tests/regression/test_issue4054.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy.vocab import Vocab -import spacy -from spacy.lang.en import English -from spacy.util import ensure_path - -from ..util import make_tempdir - - -def test_issue4054(en_vocab): - """Test that a new blank model can be made with a vocab from file, - and that serialization does not drop the language at any point.""" - nlp1 = English() - vocab1 = nlp1.vocab - - with make_tempdir() as d: - vocab_dir = ensure_path(d / "vocab") - if not vocab_dir.exists(): - vocab_dir.mkdir() - vocab1.to_disk(vocab_dir) - - vocab2 = Vocab().from_disk(vocab_dir) - print("lang", vocab2.lang) - nlp2 = spacy.blank("en", vocab=vocab2) - - nlp_dir = ensure_path(d / "nlp") - if not nlp_dir.exists(): - nlp_dir.mkdir() - nlp2.to_disk(nlp_dir) - nlp3 = spacy.load(nlp_dir) - assert nlp3.lang == "en" diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py deleted file mode 100644 index 4849aa238..000000000 --- a/spacy/tests/regression/test_issue4120.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.matcher import Matcher 
-from spacy.tokens import Doc - - -def test_issue4120(en_vocab): - """Test that matches without a final {OP: ?} token are returned.""" - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) - doc1 = Doc(en_vocab, words=["a"]) - assert len(matcher(doc1)) == 1 # works - - doc2 = Doc(en_vocab, words=["a", "b", "c"]) - assert len(matcher(doc2)) == 2 # fixed - - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) - doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc3)) == 2 # works - - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) - doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc4)) == 3 # fixed diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py deleted file mode 100644 index a726806d7..000000000 --- a/spacy/tests/regression/test_issue4133.py +++ /dev/null @@ -1,28 +0,0 @@ -from spacy.lang.en import English -from spacy.tokens import Doc -from spacy.vocab import Vocab - - -def test_issue4133(en_vocab): - nlp = English() - vocab_bytes = nlp.vocab.to_bytes() - words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] - pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] - doc = Doc(en_vocab, words=words) - for i, token in enumerate(doc): - token.pos_ = pos[i] - - # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True - - doc_bytes = doc.to_bytes() - - vocab = Vocab() - vocab = vocab.from_bytes(vocab_bytes) - doc = Doc(vocab).from_bytes(doc_bytes) - - actual = [] - for token in doc: - actual.append(token.pos_) - - assert actual == pos diff --git a/spacy/tests/regression/test_issue4190.py b/spacy/tests/regression/test_issue4190.py deleted file mode 100644 index 97d532d2a..000000000 --- a/spacy/tests/regression/test_issue4190.py +++ /dev/null @@ -1,46 +0,0 @@ -from spacy.lang.en import English -from spacy.tokenizer import Tokenizer -from spacy import util - -from ..util import make_tempdir - - -def test_issue4190(): - test_string = "Test c." - # Load default language - nlp_1 = English() - doc_1a = nlp_1(test_string) - result_1a = [token.text for token in doc_1a] # noqa: F841 - # Modify tokenizer - customize_tokenizer(nlp_1) - doc_1b = nlp_1(test_string) - result_1b = [token.text for token in doc_1b] - # Save and Reload - with make_tempdir() as model_dir: - nlp_1.to_disk(model_dir) - nlp_2 = util.load_model(model_dir) - # This should be the modified tokenizer - doc_2 = nlp_2(test_string) - result_2 = [token.text for token in doc_2] - assert result_1b == result_2 - - -def customize_tokenizer(nlp): - prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes) - infix_re = util.compile_infix_regex(nlp.Defaults.infixes) - # Remove all exceptions where a single letter is followed by a period (e.g. 
'h.') - exceptions = { - k: v - for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() - if not (len(k) == 2 and k[1] == ".") - } - new_tokenizer = Tokenizer( - nlp.vocab, - exceptions, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=nlp.tokenizer.token_match, - ) - nlp.tokenizer = new_tokenizer diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py deleted file mode 100644 index 891f03b30..000000000 --- a/spacy/tests/regression/test_issue4267.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - - -def test_issue4267(): - """ Test that running an entity_ruler after ner gives consistent results""" - nlp = English() - ner = nlp.create_pipe("ner") - ner.add_label("PEOPLE") - nlp.add_pipe(ner) - nlp.begin_training() - - assert "ner" in nlp.pipe_names - - # assert that we have correct IOB annotations - doc1 = nlp("hi") - assert doc1.is_nered - for token in doc1: - assert token.ent_iob == 2 - - # add entity ruler and run again - ruler = EntityRuler(nlp) - patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] - - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - assert "entity_ruler" in nlp.pipe_names - assert "ner" in nlp.pipe_names - - # assert that we still have correct IOB annotations - doc2 = nlp("hi") - assert doc2.is_nered - for token in doc2: - assert token.ent_iob == 2 diff --git a/spacy/tests/regression/test_issue4272.py b/spacy/tests/regression/test_issue4272.py deleted file mode 100644 index 4bac97a44..000000000 --- a/spacy/tests/regression/test_issue4272.py +++ /dev/null @@ -1,9 +0,0 @@ -from spacy.lang.el import Greek - - -def test_issue4272(): - """Test that lookup table can be accessed from Token.lemma if no POS tags - are available.""" - nlp = Greek() - doc = nlp("Χθες") - assert doc[0].lemma_ diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py deleted file mode 100644 index ffbc41226..000000000 --- a/spacy/tests/regression/test_issue4278.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -from spacy.language import Language -from spacy.pipeline import Pipe - - -class DummyPipe(Pipe): - def __init__(self): - self.model = "dummy_model" - - def predict(self, docs): - return ([1, 2, 3], [4, 5, 6]) - - def set_annotations(self, docs, scores, tensors=None): - return docs - - -@pytest.fixture -def nlp(): - return Language() - - -def test_multiple_predictions(nlp): - doc = nlp.make_doc("foo") - dummy_pipe = DummyPipe() - dummy_pipe(doc) diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py deleted file mode 100644 index 3bddc26ca..000000000 --- a/spacy/tests/regression/test_issue4313.py +++ /dev/null @@ -1,47 +0,0 @@ -from collections import defaultdict - -import pytest - -from spacy.pipeline.defaults import default_ner -from spacy.pipeline import EntityRecognizer - -from spacy.lang.en import English -from spacy.tokens import Span - - -# skipped after removing Beam stuff during the Example/GoldParse refactor -@pytest.mark.skip -def test_issue4313(): - """ This should not crash or exit with some strange error code """ - beam_width = 16 - beam_density = 0.0001 - nlp = English() - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - ner = EntityRecognizer(nlp.vocab, default_ner(), **config) - ner.add_label("SOME_LABEL") - ner.begin_training([]) - nlp.add_pipe(ner) - - # 
add a new label to the doc - doc = nlp("What do you think about Apple ?") - assert len(ner.labels) == 1 - assert "SOME_LABEL" in ner.labels - apple_ent = Span(doc, 5, 6, label="MY_ORG") - doc.ents = list(doc.ents) + [apple_ent] - - # ensure the beam_parse still works with the new label - docs = [doc] - beams = nlp.entity.beam_parse( - docs, beam_width=beam_width, beam_density=beam_density - ) - - for doc, beam in zip(docs, beams): - entity_scores = defaultdict(float) - for score, ents in nlp.entity.moves.get_beam_parses(beam): - for start, end, label in ents: - entity_scores[(start, end, label)] += score diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py deleted file mode 100644 index 06b03df24..000000000 --- a/spacy/tests/regression/test_issue4348.py +++ /dev/null @@ -1,24 +0,0 @@ -from spacy.gold import Example -from spacy.lang.en import English -from spacy.util import minibatch -from thinc.api import compounding -import pytest - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_issue4348(): - """Test that training the tagger with empty data, doesn't throw errors""" - - nlp = English() - example = Example.from_dict(nlp.make_doc(""), {"tags": []}) - TRAIN_DATA = [example, example] - - tagger = nlp.create_pipe("tagger") - nlp.add_pipe(tagger) - - optimizer = nlp.begin_training() - for i in range(5): - losses = {} - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py deleted file mode 100644 index 917847a05..000000000 --- a/spacy/tests/regression/test_issue4367.py +++ /dev/null @@ -1,8 +0,0 @@ -from spacy.tokens import DocBin - - -def test_issue4367(): - """Test that docbin init goes well""" - DocBin() - DocBin(attrs=["LEMMA"]) - DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) diff --git a/spacy/tests/regression/test_issue4373.py b/spacy/tests/regression/test_issue4373.py deleted file mode 100644 index dbde1624e..000000000 --- a/spacy/tests/regression/test_issue4373.py +++ /dev/null @@ -1,10 +0,0 @@ -from spacy.matcher import Matcher, PhraseMatcher -from spacy.vocab import Vocab - - -def test_issue4373(): - """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" - matcher = Matcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - matcher = PhraseMatcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py deleted file mode 100644 index 9c596aaf6..000000000 --- a/spacy/tests/regression/test_issue4402.py +++ /dev/null @@ -1,98 +0,0 @@ -from spacy.gold import Corpus -from spacy.lang.en import English - -from ..util import make_tempdir -from ...gold.converters import json2docs -from ...tokens import DocBin - - -def test_issue4402(): - nlp = English() - attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] - with make_tempdir() as tmpdir: - output_file = tmpdir / "test4402.spacy" - docs = json2docs([json_data]) - data = DocBin(docs=docs, attrs=attrs).to_bytes() - with output_file.open("wb") as file_: - file_.write(data) - corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - - train_data = list(corpus.train_dataset(nlp)) - assert len(train_data) == 2 - - split_train_data = [] - for eg in train_data: - split_train_data.extend(eg.split_sents()) - assert len(split_train_data) == 4 - - -json_data = { - "id": 0, - 
"paragraphs": [ - { - "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "How", "ner": "O"}, - {"id": 1, "orth": "should", "ner": "O"}, - {"id": 2, "orth": "I", "ner": "O"}, - {"id": 3, "orth": "cook", "ner": "O"}, - {"id": 4, "orth": "bacon", "ner": "O"}, - {"id": 5, "orth": "in", "ner": "O"}, - {"id": 6, "orth": "an", "ner": "O"}, - {"id": 7, "orth": "oven", "ner": "O"}, - {"id": 8, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - { - "tokens": [ - {"id": 9, "orth": "\n", "ner": "O"}, - {"id": 10, "orth": "I", "ner": "O"}, - {"id": 11, "orth": "'ve", "ner": "O"}, - {"id": 12, "orth": "heard", "ner": "O"}, - {"id": 13, "orth": "of", "ner": "O"}, - {"id": 14, "orth": "people", "ner": "O"}, - {"id": 15, "orth": "cooking", "ner": "O"}, - {"id": 16, "orth": "bacon", "ner": "O"}, - {"id": 17, "orth": "in", "ner": "O"}, - {"id": 18, "orth": "an", "ner": "O"}, - {"id": 19, "orth": "oven", "ner": "O"}, - {"id": 20, "orth": ".", "ner": "O"}, - ], - "brackets": [], - }, - ], - "cats": [ - {"label": "baking", "value": 1.0}, - {"label": "not_baking", "value": 0.0}, - ], - }, - { - "raw": "What is the difference between white and brown eggs?\n", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "What", "ner": "O"}, - {"id": 1, "orth": "is", "ner": "O"}, - {"id": 2, "orth": "the", "ner": "O"}, - {"id": 3, "orth": "difference", "ner": "O"}, - {"id": 4, "orth": "between", "ner": "O"}, - {"id": 5, "orth": "white", "ner": "O"}, - {"id": 6, "orth": "and", "ner": "O"}, - {"id": 7, "orth": "brown", "ner": "O"}, - {"id": 8, "orth": "eggs", "ner": "O"}, - {"id": 9, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, - ], - "cats": [ - {"label": "baking", "value": 0.0}, - {"label": "not_baking", "value": 1.0}, - ], - }, - ], -} diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py new file mode 100644 index 000000000..01d7a1dbb --- /dev/null +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -0,0 +1,288 @@ +import pytest +from mock import Mock +from spacy.pipeline import EntityRuler +from spacy.matcher import DependencyMatcher +from spacy.tokens import Doc, Span, DocBin +from spacy.gold import Example +from spacy.gold.converters.conllu2docs import conllu2docs +from spacy.lang.en import English +from spacy.kb import KnowledgeBase +from spacy.vocab import Vocab +from spacy.language import Language +from spacy.util import ensure_path, load_model_from_path +import numpy +import pickle + +from ..util import get_doc, make_tempdir + + +def test_issue4528(en_vocab): + """Test that user_data is correctly serialized in DocBin.""" + doc = Doc(en_vocab, words=["hello", "world"]) + doc.user_data["foo"] = "bar" + # This is how extension attribute values are stored in the user data + doc.user_data[("._.", "foo", None, None)] = "bar" + doc_bin = DocBin(store_user_data=True) + doc_bin.add(doc) + doc_bin_bytes = doc_bin.to_bytes() + new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) + new_doc = list(new_doc_bin.get_docs(en_vocab))[0] + assert new_doc.user_data["foo"] == "bar" + assert new_doc.user_data[("._.", "foo", None, None)] == "bar" + + +@pytest.mark.parametrize( + "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] +) +def test_gold_misaligned(en_tokenizer, text, words): + doc = en_tokenizer(text) + Example.from_dict(doc, {"words": words}) + + +def 
test_issue4590(en_vocab): + """Test that matches param in on_match method are the same as matches run with no on_match method""" + pattern = [ + {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + ] + + on_match = Mock() + matcher = DependencyMatcher(en_vocab) + matcher.add("pattern", on_match, pattern) + text = "The quick brown fox jumped over the lazy fox" + heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] + deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] + doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) + matches = matcher(doc) + on_match_args = on_match.call_args + assert on_match_args[0][3] == matches + + +def test_issue4651_with_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialize correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + specified. + """ + text = "Spacy is a python library for nlp" + nlp = English() + ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) + nlp_reloaded.add_pipe(ruler_reloaded) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +def test_issue4651_without_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialize correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + not specified. + """ + text = "Spacy is a python library for nlp" + nlp = English() + ruler = EntityRuler(nlp) + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) + nlp_reloaded.add_pipe(ruler_reloaded) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +def test_issue4665(): + """ + conllu2json should not raise an exception if the HEAD column contains an + underscore + """ + input_data = """ +1 [ _ PUNCT -LRB- _ _ punct _ _ +2 This _ DET DT _ _ det _ _ +3 killing _ NOUN NN _ _ nsubj _ _ +4 of _ ADP IN _ _ case _ _ +5 a _ DET DT _ _ det _ _ +6 respected _ ADJ JJ _ _ amod _ _ +7 cleric _ NOUN NN _ _ nmod _ _ +8 will _ AUX MD _ _ aux _ _ +9 be _ AUX VB _ _ aux _ _ +10 causing _ VERB VBG _ _ root _ _ +11 us _ PRON PRP _ _ iobj _ _ +12 trouble _ NOUN NN _ _ dobj _ _ +13 for _ ADP IN _ _ case _ _ +14 years _ NOUN NNS _ _ nmod _ _ +15 to _ PART TO _ _ mark _ _ +16 come _ VERB VB _ _ acl _ _ +17 . _ PUNCT . 
_ _ punct _ _ +18 ] _ PUNCT -RRB- _ _ punct _ _ +""" + conllu2docs(input_data) + + +def test_issue4674(): + """Test that setting entities with overlapping identifiers does not mess up IO""" + nlp = English() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + vector1 = [0.9, 1.1, 1.01] + vector2 = [1.8, 2.25, 2.01] + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) + assert kb.get_size_entities() == 1 + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + kb.dump(str(file_path)) + kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) + kb2.load_bulk(str(file_path)) + assert kb2.get_size_entities() == 1 + + +def test_issue4707(): + """Tests that disabled component names are also excluded from nlp.from_disk + by default when loading a model. + """ + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(nlp.create_pipe("entity_ruler")) + assert nlp.pipe_names == ["sentencizer", "entity_ruler"] + exclude = ["tokenizer", "sentencizer"] + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir, exclude=exclude) + new_nlp = load_model_from_path(tmpdir, disable=exclude) + assert "sentencizer" not in new_nlp.pipe_names + assert "entity_ruler" in new_nlp.pipe_names + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4725_1(): + """ Ensure the pickling of the NER goes well""" + vocab = Vocab(vectors_name="test_vocab_add_vector") + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) + with make_tempdir() as tmp_path: + with (tmp_path / "ner.pkl").open("wb") as file_: + pickle.dump(ner, file_) + assert ner.cfg["min_action_freq"] == 342 + + with (tmp_path / "ner.pkl").open("rb") as file_: + ner2 = pickle.load(file_) + assert ner2.cfg["min_action_freq"] == 342 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4725_2(): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows) + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + nlp.begin_training() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + + +def test_issue4849(): + nlp = English() + ruler = EntityRuler( + nlp, + patterns=[ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, + ], + phrase_matcher_attr="LOWER", + ) + nlp.add_pipe(ruler) + text = """ + The left is starting to take aim at Democratic front-runner Joe Biden. + Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." 
+ """ + # USING 1 PROCESS + count_ents = 0 + for doc in nlp.pipe([text], n_process=1): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + # USING 2 PROCESSES + count_ents = 0 + for doc in nlp.pipe([text], n_process=2): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + + +class CustomPipe: + name = "my_pipe" + + def __init__(self): + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + +def test_issue4903(): + """Ensure that this runs correctly and doesn't hang or crash on Windows / + macOS.""" + nlp = English() + custom_component = CustomPipe() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(custom_component, after="sentencizer") + + text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + docs = list(nlp.pipe(text, n_process=2)) + assert docs[0].text == "I like bananas." + assert docs[1].text == "Do you like them?" + assert docs[2].text == "No, I prefer wasabi." + + +def test_issue4924(): + nlp = Language() + example = Example.from_dict(nlp.make_doc(""), {}) + nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py deleted file mode 100644 index 6f96c9f2d..000000000 --- a/spacy/tests/regression/test_issue4528.py +++ /dev/null @@ -1,16 +0,0 @@ -from spacy.tokens import Doc, DocBin - - -def test_issue4528(en_vocab): - """Test that user_data is correctly serialized in DocBin.""" - doc = Doc(en_vocab, words=["hello", "world"]) - doc.user_data["foo"] = "bar" - # This is how extension attribute values are stored in the user data - doc.user_data[("._.", "foo", None, None)] = "bar" - doc_bin = DocBin(store_user_data=True) - doc_bin.add(doc) - doc_bin_bytes = doc_bin.to_bytes() - new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) - new_doc = list(new_doc_bin.get_docs(en_vocab))[0] - assert new_doc.user_data["foo"] == "bar" - assert new_doc.user_data[("._.", "foo", None, None)] == "bar" diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py deleted file mode 100644 index 0708499de..000000000 --- a/spacy/tests/regression/test_issue4529.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest - -from spacy.gold import Example - - -@pytest.mark.parametrize( - "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] -) -def test_gold_misaligned(en_tokenizer, text, words): - doc = en_tokenizer(text) - Example.from_dict(doc, {"words": words}) diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py deleted file mode 100644 index fc49c5117..000000000 --- a/spacy/tests/regression/test_issue4590.py +++ /dev/null @@ -1,35 +0,0 @@ -from mock import Mock -from spacy.matcher import DependencyMatcher -from ..util import get_doc - - -def test_issue4590(en_vocab): - """Test that matches param in on_match method are the same as matches run with no on_match method""" - pattern = [ - {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - { - "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, - "PATTERN": {"ORTH": "fox"}, - }, - { - "SPEC": {"NODE_NAME": "quick", 
"NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, - "PATTERN": {"ORTH": "fox"}, - }, - ] - - on_match = Mock() - - matcher = DependencyMatcher(en_vocab) - matcher.add("pattern", on_match, pattern) - - text = "The quick brown fox jumped over the lazy fox" - heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] - deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] - - doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) - - matches = matcher(doc) - - on_match_args = on_match.call_args - - assert on_match_args[0][3] == matches diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py deleted file mode 100644 index 3f6c1a57c..000000000 --- a/spacy/tests/regression/test_issue4651.py +++ /dev/null @@ -1,62 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - -from ..util import make_tempdir - - -def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - specified. - """ - text = "Spacy is a python library for nlp" - - nlp = English() - ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) - - nlp_reloaded.add_pipe(ruler_reloaded) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - - assert res == res_reloaded - - -def test_issue4651_without_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - not specified. 
- """ - text = "Spacy is a python library for nlp" - - nlp = English() - ruler = EntityRuler(nlp) - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) - - nlp_reloaded.add_pipe(ruler_reloaded) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - - assert res == res_reloaded diff --git a/spacy/tests/regression/test_issue4665.py b/spacy/tests/regression/test_issue4665.py deleted file mode 100644 index e28d0f44a..000000000 --- a/spacy/tests/regression/test_issue4665.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest - -# TODO -# from spacy.gold.converters.conllu2docs import conllu2docs - -input_data = """ -1 [ _ PUNCT -LRB- _ _ punct _ _ -2 This _ DET DT _ _ det _ _ -3 killing _ NOUN NN _ _ nsubj _ _ -4 of _ ADP IN _ _ case _ _ -5 a _ DET DT _ _ det _ _ -6 respected _ ADJ JJ _ _ amod _ _ -7 cleric _ NOUN NN _ _ nmod _ _ -8 will _ AUX MD _ _ aux _ _ -9 be _ AUX VB _ _ aux _ _ -10 causing _ VERB VBG _ _ root _ _ -11 us _ PRON PRP _ _ iobj _ _ -12 trouble _ NOUN NN _ _ dobj _ _ -13 for _ ADP IN _ _ case _ _ -14 years _ NOUN NNS _ _ nmod _ _ -15 to _ PART TO _ _ mark _ _ -16 come _ VERB VB _ _ acl _ _ -17 . _ PUNCT . _ _ punct _ _ -18 ] _ PUNCT -RRB- _ _ punct _ _ -""" - - -@pytest.mark.xfail -def test_issue4665(): - """ - conllu2json should not raise an exception if the HEAD column contains an - underscore - """ - pass - # conllu2json(input_data) diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py deleted file mode 100644 index 149e1431b..000000000 --- a/spacy/tests/regression/test_issue4674.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest -from spacy.kb import KnowledgeBase -from spacy.util import ensure_path -from spacy.lang.en import English - -from ..util import make_tempdir - - -def test_issue4674(): - """Test that setting entities with overlapping identifiers does not mess up IO""" - nlp = English() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) - - vector1 = [0.9, 1.1, 1.01] - vector2 = [1.8, 2.25, 2.01] - with pytest.warns(UserWarning): - kb.set_entities( - entity_list=["Q1", "Q1"], - freq_list=[32, 111], - vector_list=[vector1, vector2], - ) - - assert kb.get_size_entities() == 1 - - # dumping to file & loading back in - with make_tempdir() as d: - dir_path = ensure_path(d) - if not dir_path.exists(): - dir_path.mkdir() - file_path = dir_path / "kb" - kb.dump(str(file_path)) - - kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) - kb2.load_bulk(str(file_path)) - - assert kb2.get_size_entities() == 1 diff --git a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py deleted file mode 100644 index d9798ef84..000000000 --- a/spacy/tests/regression/test_issue4707.py +++ /dev/null @@ -1,20 +0,0 @@ -from spacy.util import load_model_from_path -from spacy.lang.en import English - -from ..util import make_tempdir - - -def test_issue4707(): - """Tests that disabled component names are also excluded from nlp.from_disk - by default when loading a model. 
- """ - nlp = English() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(nlp.create_pipe("entity_ruler")) - assert nlp.pipe_names == ["sentencizer", "entity_ruler"] - exclude = ["tokenizer", "sentencizer"] - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir, exclude=exclude) - new_nlp = load_model_from_path(tmpdir, disable=exclude) - assert "sentencizer" not in new_nlp.pipe_names - assert "entity_ruler" in new_nlp.pipe_names diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py deleted file mode 100644 index cdc3c09ca..000000000 --- a/spacy/tests/regression/test_issue4725.py +++ /dev/null @@ -1,41 +0,0 @@ -import pickle -import numpy - -from spacy.lang.en import English -from spacy.vocab import Vocab - -from spacy.tests.util import make_tempdir - - -def test_pickle_ner(): - """ Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") - nlp = English(vocab=vocab) - ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) - with make_tempdir() as tmp_path: - with (tmp_path / "ner.pkl").open("wb") as file_: - pickle.dump(ner, file_) - assert ner.cfg["min_action_freq"] == 342 - - with (tmp_path / "ner.pkl").open("rb") as file_: - ner2 = pickle.load(file_) - assert ner2.cfg["min_action_freq"] == 342 - - -def test_issue4725(): - # ensures that this runs correctly and doesn't hang or crash because of the global vectors - # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows) - vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") - data[0] = 1.0 - data[1] = 2.0 - vocab.set_vector("cat", data[0]) - vocab.set_vector("dog", data[1]) - - nlp = English(vocab=vocab) - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - nlp.begin_training() - docs = ["Kurt is in London."] * 10 - for _ in nlp.pipe(docs, batch_size=2, n_process=2): - pass diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py deleted file mode 100644 index ddbf6f7a0..000000000 --- a/spacy/tests/regression/test_issue4849.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - - -def test_issue4849(): - nlp = English() - - ruler = EntityRuler( - nlp, - patterns=[ - {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, - {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, - ], - phrase_matcher_attr="LOWER", - ) - - nlp.add_pipe(ruler) - - text = """ - The left is starting to take aim at Democratic front-runner Joe Biden. - Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." 
- """ - - # USING 1 PROCESS - count_ents = 0 - for doc in nlp.pipe([text], n_process=1): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - - # USING 2 PROCESSES - count_ents = 0 - for doc in nlp.pipe([text], n_process=2): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py deleted file mode 100644 index a3dff16aa..000000000 --- a/spacy/tests/regression/test_issue4903.py +++ /dev/null @@ -1,40 +0,0 @@ -from spacy.lang.en import English -from spacy.tokens import Span, Doc - - -class CustomPipe: - name = "my_pipe" - - def __init__(self): - Span.set_extension("my_ext", getter=self._get_my_ext) - Doc.set_extension("my_ext", default=None) - - def __call__(self, doc): - gathered_ext = [] - for sent in doc.sents: - sent_ext = self._get_my_ext(sent) - sent._.set("my_ext", sent_ext) - gathered_ext.append(sent_ext) - - doc._.set("my_ext", "\n".join(gathered_ext)) - - return doc - - @staticmethod - def _get_my_ext(span): - return str(span.end) - - -def test_issue4903(): - # ensures that this runs correctly and doesn't hang or crash on Windows / macOS - - nlp = English() - custom_component = CustomPipe() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(custom_component, after="sentencizer") - - text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] - docs = list(nlp.pipe(text, n_process=2)) - assert docs[0].text == "I like bananas." - assert docs[1].text == "Do you like them?" - assert docs[2].text == "No, I prefer wasabi." diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py deleted file mode 100644 index c3d3c4326..000000000 --- a/spacy/tests/regression/test_issue4924.py +++ /dev/null @@ -1,8 +0,0 @@ -from spacy.gold import Example -from spacy.language import Language - - -def test_issue4924(): - nlp = Language() - example = Example.from_dict(nlp.make_doc(""), {}) - nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index a9a57746d..3c1cee5c3 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,6 +1,8 @@ +import pytest from spacy.lang.en import English +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue5152(): # Test that the comparison between a Span and a Token, goes well # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) @@ -8,7 +10,6 @@ def test_issue5152(): text = nlp("Talk about being boring!") text_var = nlp("Talk of being boring!") y = nlp("Let") - span = text[0:3] # Talk about being span_2 = text[0:3] # Talk about being span_3 = text_var[0:3] # Talk of being diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9ffa3862c..86020bf17 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -63,7 +63,8 @@ def tagger(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. 
the affected line is the one for model serialization - tagger.begin_training(pipeline=nlp.pipeline) + with pytest.warns(UserWarning): + tagger.begin_training(pipeline=nlp.pipeline) return tagger diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index cd354ff92..7d3033560 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,10 +1,11 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align +from spacy.gold import spans_from_biluo_tags, iob_to_biluo from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example from spacy.gold.converters import json2docs from spacy.lang.en import English +from spacy.pipeline import EntityRuler from spacy.tokens import Doc, DocBin from spacy.util import get_words_and_spaces, minibatch from thinc.api import compounding @@ -271,75 +272,76 @@ def test_split_sentences(en_vocab): assert split_examples[1].text == "had loads of fun " -@pytest.mark.xfail(reason="Alignment should be fixed after example refactor") def test_gold_biluo_one_to_many(en_vocab, en_tokenizer): - words = ["I", "flew to", "San Francisco Valley", "."] - spaces = [True, True, False, False] + words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."] + spaces = [True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + prefix = "Mr and Mrs Smith flew to " + entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] + gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "O", "U-LOC", "O"] + assert ner_tags == ["O", "O", "O", "U-LOC", "O"] entities = [ - (len("I "), len("I flew to"), "ORG"), - (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "U-ORG", "U-LOC", "O"] + assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"] entities = [ - (len("I "), len("I flew"), "ORG"), - (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + (len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", None, "U-LOC", "O"] + assert ner_tags == ["O", None, "O", "U-LOC", "O"] def test_gold_biluo_many_to_one(en_vocab, en_tokenizer): - words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + spaces = [True, True, True, True, True, True, True, 
False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + prefix = "Mr and Mrs Smith flew to " + entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] + gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + + entities = [ + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + + +def test_gold_biluo_misaligned(en_vocab, en_tokenizer): + words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gold_words = ["I", "flew to", "San Francisco Valley", "."] + prefix = "Mr and Mrs Smith flew to " + entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] + gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] entities = [ - (len("I "), len("I flew to"), "ORG"), - (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["I", "flew to", "San Francisco Valley", "."] + gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"] - - -@pytest.mark.xfail(reason="Alignment should be fixed after example refactor") -def test_gold_biluo_misaligned(en_vocab, en_tokenizer): - words = ["I flew", "to", "San Francisco", "Valley", "."] - spaces = [True, True, True, False, False] - doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gold_words = ["I", "flew to", "San", "Francisco Valley", "."] - example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) - ner_tags = example.get_aligned_ner() - assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"] - - entities = [ - (len("I "), len("I flew to"), "ORG"), - (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), - ] - gold_words = ["I", "flew to", "San", "Francisco Valley", "."] - example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) - ner_tags = example.get_aligned_ner() - assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"] + assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"] def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): @@ -349,7 +351,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): "I flew to San 
Francisco Valley.", ) doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + prefix = "I flew to " + entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."] gold_spaces = [True, True, False, True, False, False] example = Example.from_dict( @@ -405,6 +408,49 @@ def test_biluo_spans(en_tokenizer): assert spans[1].label_ == "GPE" +def test_aligned_spans_y2x(en_vocab, en_tokenizer): + words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."] + spaces = [True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + prefix = "Mr and Mrs Smith flew to " + entities = [ + (0, len("Mr and Mrs Smith"), "PERSON"), + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities}) + ents_ref = example.reference.ents + assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)] + ents_y2x = example.get_aligned_spans_y2x(ents_ref) + assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)] + + +def test_aligned_spans_x2y(en_vocab, en_tokenizer): + text = "Mr and Mrs Smith flew to San Francisco Valley" + nlp = English() + ruler = EntityRuler(nlp) + patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"}, + {"label": "LOC", "pattern": "San Francisco Valley"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)] + prefix = "Mr and Mrs Smith flew to " + entities = [ + (0, len("Mr and Mrs Smith"), "PERSON"), + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"] + example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities}) + assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)] + + # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct + ents_pred = example.predicted.ents + assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)] + ents_x2y = example.get_aligned_spans_x2y(ents_pred) + assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)] + + def test_gold_ner_missing_tags(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] @@ -412,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer): assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] +def test_projectivize(en_tokenizer): + doc = en_tokenizer("He pretty quickly walks away") + heads = [3, 2, 3, 0, 2] + example = Example.from_dict(doc, {"heads": heads}) + proj_heads, proj_labels = example.get_aligned_parse(projectivize=True) + nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False) + assert proj_heads == [3, 2, 3, 0, 3] + assert nonproj_heads == [3, 2, 3, 0, 2] + + def test_iob_to_biluo(): good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] @@ -514,6 +570,7 @@ def test_make_orth_variants(doc): make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) +@pytest.mark.skip("Outdated") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ @@ -537,12 +594,12 @@ def test_make_orth_variants(doc): ([" ", 
"a"], ["a"], (1, [-1, 0], [1], {}, {})), ], ) -def test_align(tokens_a, tokens_b, expected): - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) - assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected +def test_align(tokens_a, tokens_b, expected): # noqa + cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa + assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa # check symmetry - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) - assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected + cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa + assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa def test_goldparse_startswith_space(en_tokenizer): @@ -556,7 +613,7 @@ def test_goldparse_startswith_space(en_tokenizer): doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} ) ner_tags = example.get_aligned_ner() - assert ner_tags == [None, "U-DATE"] + assert ner_tags == ["O", "U-DATE"] assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 58eab4a54..f858b0759 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -55,7 +55,7 @@ def test_aligned_tags(): predicted = Doc(vocab, words=pred_words) example = Example.from_dict(predicted, annots) aligned_tags = example.get_aligned("tag", as_string=True) - assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"] + assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"] def test_aligned_tags_multi(): diff --git a/spacy/tests/test_projects.py b/spacy/tests/test_projects.py new file mode 100644 index 000000000..c3477f463 --- /dev/null +++ b/spacy/tests/test_projects.py @@ -0,0 +1,31 @@ +import pytest +from spacy.cli.project.util import validate_project_commands +from spacy.schemas import ProjectConfigSchema, validate + + +@pytest.mark.parametrize( + "config", + [ + {"commands": [{"name": "a"}, {"name": "a"}]}, + {"commands": [{"name": "a"}], "workflows": {"a": []}}, + {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}}, + ], +) +def test_project_config_validation1(config): + with pytest.raises(SystemExit): + validate_project_commands(config) + + +@pytest.mark.parametrize( + "config,n_errors", + [ + ({"commands": {"a": []}}, 1), + ({"commands": [{"help": "..."}]}, 1), + ({"commands": [{"name": "a", "extra": "b"}]}, 1), + ({"commands": [{"extra": "b"}]}, 2), + ({"commands": [{"name": "a", "deps": [123]}]}, 1), + ], +) +def test_project_config_validation2(config, n_errors): + errors = validate(ProjectConfigSchema, config) + assert len(errors) == n_errors diff --git a/spacy/util.py b/spacy/util.py index 4a17b7f24..66b88d2d8 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -449,6 +449,16 @@ def split_command(command: str) -> List[str]: return shlex.split(command, posix=not is_windows) +def join_command(command: List[str]) -> str: + """Join a command using shlex. shlex.join is only available for Python 3.8+, + so we're using a workaround here. + + command (List[str]): The command to join. + RETURNS (str): The joined command + """ + return " ".join(shlex.quote(cmd) for cmd in command) + + def run_command(command: Union[str, List[str]]) -> None: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. 
@@ -520,6 +530,15 @@ def get_checksum(path: Union[Path, str]) -> str: return hashlib.md5(Path(path).read_bytes()).hexdigest() +def is_cwd(path: Union[Path, str]) -> bool: + """Check whether a path is the current working directory. + + path (Union[Path, str]): The directory path. + RETURNS (bool): Whether the path is the current working directory. + """ + return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower() + + def is_in_jupyter(): """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer.