diff --git a/examples/experiments/onto-ner.cfg b/examples/experiments/onto-ner.cfg
index 48fe25a67..eab68a27f 100644
--- a/examples/experiments/onto-ner.cfg
+++ b/examples/experiments/onto-ner.cfg
@@ -5,16 +5,16 @@
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length or number of examples.
-max_length = 5000
+max_length = 3000
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-dropout = 0.2
+dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
-patience = 1600
+patience = 100000
 max_epochs = 0
-max_steps = 20000
-eval_frequency = 500
+max_steps = 0
+eval_frequency = 1000
 # Other settings
 seed = 0
 accumulate_gradient = 1
@@ -26,6 +26,7 @@ score_weights = {"ents_f": 1.0}
 init_tok2vec = null
 discard_oversize = false
 omit_extra_lookups = false
+batch_by = "words"
 
 [training.batch_size]
 @schedules = "compounding.v1"
@@ -37,19 +38,13 @@ compound = 1.001
 @optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
-L2_is_weight_decay = false
-L2 = 1e-6
+L2_is_weight_decay = true
+L2 = 0.01
 grad_clip = 1.0
 use_averages = true
 eps = 1e-8
 learn_rate = 0.001
 
-#[optimizer.learn_rate]
-#@schedules = "warmup_linear.v1"
-#warmup_steps = 250
-#total_steps = 20000
-#initial_rate = 0.001
-
 [nlp]
 lang = "en"
 vectors = null
@@ -58,8 +53,6 @@ vectors = null
 factory = "ner"
 learn_tokens = false
 min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0
 
 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
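A note on the batch size block above: "compounding.v1" resolves to Thinc's compounding
schedule, which multiplies the batch size by the compound factor on every step until it
reaches the stop value. A minimal sketch of the values such a schedule yields — the
start/stop values here are illustrative assumptions, since only compound = 1.001 is
visible in this hunk:

    from itertools import islice
    from thinc.api import compounding

    # start/stop are hypothetical; the hunk only shows compound = 1.001
    batch_sizes = compounding(start=100.0, stop=1000.0, compound=1.001)
    # grows roughly geometrically: 100.0, 100.1, 100.2, ...
    print(list(islice(batch_sizes, 3)))
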
diff --git a/spacy/about.py b/spacy/about.py
index 057e21c87..008412359 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,8 +1,7 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a2"
+__version__ = "3.0.0a4"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
 __projects__ = "https://github.com/explosion/spacy-boilerplates"
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 5dc3070b6..0b92f8bf4 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -11,12 +11,15 @@ from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .validate import validate  # noqa: F401
-from .project import project_clone, project_assets, project_run  # noqa: F401
-from .project import project_run_all  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py
index 2b3ad9524..e970c4dde 100644
--- a/spacy/cli/_app.py
+++ b/spacy/cli/_app.py
@@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface
 DOCS: https://spacy.io/api/cli
 """
 
+PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
+project templates. You'd typically start by cloning a project template to a local
+directory and fetching its assets like datasets etc. See the project's
+project.yml for the available commands.
+"""
 
 app = typer.Typer(name=NAME, help=HELP)
+project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
+app.add_typer(project_cli)
 
 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index c8c5a3902..c26b5ee75 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -120,8 +120,12 @@ def convert(
             no_print=silent,
             ner_map=ner_map,
         )
+        if file_type == "json":
+            data = [docs_to_json(docs)]
+        else:
+            data = DocBin(docs=docs, store_user_data=True).to_bytes()
         if output_dir == "-":
-            _print_docs_to_stdout(docs, file_type)
+            _print_docs_to_stdout(data, file_type)
         else:
             if input_loc != input_path:
                 subpath = input_loc.relative_to(input_path)
@@ -129,24 +133,23 @@
             else:
                 output_file = Path(output_dir) / input_loc.parts[-1]
             output_file = output_file.with_suffix(f".{file_type}")
-            _write_docs_to_file(docs, output_file, file_type)
+            _write_docs_to_file(data, output_file, file_type)
             msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
 
 
-def _print_docs_to_stdout(docs, output_type):
+def _print_docs_to_stdout(data, output_type):
     if output_type == "json":
-        srsly.write_json("-", [docs_to_json(docs)])
+        srsly.write_json("-", data)
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
+        sys.stdout.buffer.write(data)
 
 
-def _write_docs_to_file(docs, output_file, output_type):
+def _write_docs_to_file(data, output_file, output_type):
     if not output_file.parent.exists():
         output_file.parent.mkdir(parents=True)
     if output_type == "json":
-        srsly.write_json(output_file, [docs_to_json(docs)])
+        srsly.write_json(output_file, data)
     else:
-        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
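The convert refactor above serializes once up front and hands the same data to both the
stdout and file writers. In the non-JSON case the output is a DocBin byte string, which
can be loaded back symmetrically — a minimal sketch, assuming English and two throwaway
docs:

    from spacy.lang.en import English
    from spacy.tokens import DocBin

    nlp = English()
    docs = list(nlp.pipe(["One sentence.", "Another sentence."]))
    # What convert now writes in the non-JSON case:
    data = DocBin(docs=docs, store_user_data=True).to_bytes()
    # Illustrative round trip back to Doc objects:
    loaded = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
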
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
new file mode 100644
index 000000000..54c71f824
--- /dev/null
+++ b/spacy/cli/debug_model.py
@@ -0,0 +1,168 @@
+from typing import List
+from pathlib import Path
+from wasabi import msg
+
+from ._app import app, Arg, Opt
+from .. import util
+from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from ..lang.en import English
+
+
+@app.command("debug-model")
+def debug_model_cli(
+    # fmt: off
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
+    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
+    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
+    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
+    attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
+    P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
+    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
+    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
+    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
+    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
+    seed: int = Opt(None, "--seed", "-s", help="Random seed"),
+    # fmt: on
+):
+    """
+    Analyze a Thinc ML model - internal structure and activations during training
+    """
+    print_settings = {
+        "dimensions": dimensions,
+        "parameters": parameters,
+        "gradients": gradients,
+        "attributes": attributes,
+        "layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
+        "print_before_training": P0,
+        "print_after_init": P1,
+        "print_after_training": P2,
+        "print_prediction": P3,
+    }
+
+    if seed is not None:
+        msg.info(f"Fixing random seed: {seed}")
+        fix_random_seed(seed)
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info(f"Using CPU")
+
+    debug_model(
+        config_path,
+        print_settings=print_settings,
+    )
+
+
+def debug_model(
+    config_path: Path,
+    *,
+    print_settings=None
+):
+    if print_settings is None:
+        print_settings = {}
+
+    model = util.load_config(config_path, create_objects=True)["model"]
+
+    # STEP 0: Printing before training
+    msg.info(f"Analysing model with ID {model.id}")
+    if print_settings.get("print_before_training"):
+        msg.info(f"Before training:")
+        _print_model(model, print_settings)
+
+    # STEP 1: Initializing the model and printing again
+    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
+    if print_settings.get("print_after_init"):
+        msg.info(f"After initialization:")
+        _print_model(model, print_settings)
+
+    # STEP 2: Updating the model and printing again
+    optimizer = Adam(0.001)
+    set_dropout_rate(model, 0.2)
+    for e in range(3):
+        Y, get_dX = model.begin_update(_get_docs())
+        dY = get_gradient(model, Y)
+        _ = get_dX(dY)
+        model.finish_update(optimizer)
+    if print_settings.get("print_after_training"):
+        msg.info(f"After training:")
+        _print_model(model, print_settings)
+
+    # STEP 3: the final prediction
+    prediction = model.predict(_get_docs())
+    if print_settings.get("print_prediction"):
+        msg.info(f"Prediction:", str(prediction))
+
+
+def get_gradient(model, Y):
+    goldY = _get_output(model.ops.xp)
+    return Y - goldY
+
+
+def _sentences():
+    return [
+        "Apple is looking at buying U.K. startup for $1 billion",
+        "Autonomous cars shift insurance liability toward manufacturers",
+        "San Francisco considers banning sidewalk delivery robots",
+        "London is a big city in the United Kingdom.",
+    ]
+
+
+def _get_docs():
+    nlp = English()
+    return list(nlp.pipe(_sentences()))
+
+
+def _get_output(xp):
+    return xp.asarray([xp.asarray([i+10, i+20, i+30], dtype="float32") for i, _ in enumerate(_get_docs())])
+
+
+def _print_model(model, print_settings):
+    layers = print_settings.get("layers", "")
+    parameters = print_settings.get("parameters", False)
+    dimensions = print_settings.get("dimensions", False)
+    gradients = print_settings.get("gradients", False)
+    attributes = print_settings.get("attributes", False)
+
+    for i, node in enumerate(model.walk()):
+        if not layers or i in layers:
+            msg.info(f"Layer {i}: model ID {node.id}: '{node.name}'")
+
+            if dimensions:
+                for name in node.dim_names:
+                    if node.has_dim(name):
+                        msg.info(f" - dim {name}: {node.get_dim(name)}")
+                    else:
+                        msg.info(f" - dim {name}: {node.has_dim(name)}")
+
+            if parameters:
+                for name in node.param_names:
+                    if node.has_param(name):
+                        print_value = _print_matrix(node.get_param(name))
+                        msg.info(f" - param {name}: {print_value}")
+                    else:
+                        msg.info(f" - param {name}: {node.has_param(name)}")
+            if gradients:
+                for name in node.param_names:
+                    if node.has_grad(name):
+                        print_value = _print_matrix(node.get_grad(name))
+                        msg.info(f" - grad {name}: {print_value}")
+                    else:
+                        msg.info(f" - grad {name}: {node.has_grad(name)}")
+            if attributes:
+                attrs = node.attrs
+                for name, value in attrs.items():
+                    msg.info(f" - attr {name}: {value}")
+
+
+def _print_matrix(value):
+    if value is None or isinstance(value, bool):
+        return value
+    result = str(value.shape) + " - sample: "
+    sample_matrix = value
+    for d in range(value.ndim-1):
+        sample_matrix = sample_matrix[0]
+    sample_matrix = sample_matrix[0:5]
+    result = result + str(sample_matrix)
+    return result
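The debug-model command expects a config with a [model] block it can resolve via
util.load_config. A typical invocation might look like this (the config path is a
placeholder):

    python -m spacy debug-model ./my_config.cfg --dimensions --parameters -P1
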
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index ea5e7a890..f192cb196 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -1,4 +1,4 @@
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence
 import requests
 import sys
 from wasabi import msg
@@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
 from .. import about
 from ..util import is_package, get_base_version, run_command
 
+# These are the old shortcuts we previously supported in spacy download. As of
+# v3, shortcuts are deprecated so we're not expecting to add anything to this
+# list. It only exists to show users warnings.
+OLD_SHORTCUTS = {
+    "en": "en_core_web_sm",
+    "de": "de_core_news_sm",
+    "es": "es_core_news_sm",
+    "pt": "pt_core_news_sm",
+    "fr": "fr_core_news_sm",
+    "it": "it_core_news_sm",
+    "nl": "nl_core_news_sm",
+    "el": "el_core_news_sm",
+    "nb": "nb_core_news_sm",
+    "lt": "lt_core_news_sm",
+    "xx": "xx_ent_wiki_sm",
+}
+
 
 @app.command(
     "download",
@@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
         version = components[-1]
         download_model(dl_tpl.format(m=model_name, v=version), pip_args)
     else:
-        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
-        model_name = shortcuts.get(model, model)
+        model_name = model
+        if model in OLD_SHORTCUTS:
+            msg.warn(
+                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
+                f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
+            )
+            model_name = OLD_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
         download_model(dl_tpl.format(m=model_name, v=version), pip_args)
@@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
     )
 
 
-def get_json(url: str, desc: str) -> Union[dict, list]:
-    r = requests.get(url)
+def get_compatibility() -> dict:
+    version = get_base_version(about.__version__)
+    r = requests.get(about.__compatibility__)
     if r.status_code != 200:
         msg.fail(
             f"Server error ({r.status_code})",
-            f"Couldn't fetch {desc}. Please find a model for your spaCy "
+            f"Couldn't fetch compatibility table. Please find a model for your spaCy "
             f"installation (v{about.__version__}), and download it manually. "
             f"For more details, see the documentation: "
             f"https://spacy.io/usage/models",
             exits=1,
         )
-    return r.json()
-
-
-def get_compatibility() -> dict:
-    version = get_base_version(about.__version__)
-    comp_table = get_json(about.__compatibility__, "compatibility table")
+    comp_table = r.json()
     comp = comp_table["spacy"]
     if version not in comp:
         msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
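With the shortcut table inlined as OLD_SHORTCUTS, a deprecated shortcut still resolves
to the full package name, but with a warning instead of a remote lookup — for example:

    python -m spacy download en
    # warns: As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use
    # the full model name 'en_core_web_sm' instead.
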
diff --git a/spacy/cli/project.py b/spacy/cli/project.py
deleted file mode 100644
index 200471127..000000000
--- a/spacy/cli/project.py
+++ /dev/null
@@ -1,708 +0,0 @@
-from typing import List, Dict, Any, Optional, Sequence
-import typer
-import srsly
-from pathlib import Path
-from wasabi import msg
-import subprocess
-import os
-import re
-import shutil
-import sys
-import requests
-import tqdm
-
-from ._app import app, Arg, Opt, COMMAND, NAME
-from .. import about
-from ..schemas import ProjectConfigSchema, validate
-from ..util import ensure_path, run_command, make_tempdir, working_dir
-from ..util import get_hash, get_checksum, split_command
-
-
-CONFIG_FILE = "project.yml"
-DVC_CONFIG = "dvc.yaml"
-DVC_DIR = ".dvc"
-DIRS = [
-    "assets",
-    "metas",
-    "configs",
-    "packages",
-    "metrics",
-    "scripts",
-    "notebooks",
-    "training",
-    "corpus",
-]
-CACHES = [
-    Path.home() / ".torch",
-    Path.home() / ".caches" / "torch",
-    os.environ.get("TORCH_HOME"),
-    Path.home() / ".keras",
-]
-DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
-# it directly and edit the project.yml instead and re-run the project."""
-CLI_HELP = f"""Command-line interface for spaCy projects and working with project
-templates. You'd typically start by cloning a project template to a local
-directory and fetching its assets like datasets etc. See the project's
-{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
-Version Control) to manage input and output files and to ensure steps are only
-re-run if their inputs change.
-"""
-
-project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
-
-
-@project_cli.callback(invoke_without_command=True)
-def callback(ctx: typer.Context):
-    """This runs before every project command and ensures DVC is installed."""
-    ensure_dvc()
-
-
-################
-# CLI COMMANDS #
-################
-
-
-@project_cli.command("clone")
-def project_clone_cli(
-    # fmt: off
-    name: str = Arg(..., help="The name of the template to fetch"),
-    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
-    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
-    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
-    no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
-    # fmt: on
-):
-    """Clone a project template from a repository. Calls into "git" and will
-    only download the files from the given subdirectory. The GitHub repo
-    defaults to the official spaCy template repo, but can be customized
-    (including using a private repo). Setting the --git flag will also
-    initialize the project directory as a Git repo. If the project is intended
-    to be a Git repo, it should be initialized with Git first, before
-    initializing DVC (Data Version Control). This allows DVC to integrate with
-    Git.
-    """
-    if dest == Path.cwd():
-        dest = dest / name
-    project_clone(name, dest, repo=repo, git=git, no_init=no_init)
-
-
-@project_cli.command("init")
-def project_init_cli(
-    # fmt: off
-    path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
-    force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
-    # fmt: on
-):
-    """Initialize a project directory with DVC and optionally Git. This should
-    typically be taken care of automatically when you run the "project clone"
-    command, but you can also run it separately. If the project is intended to
-    be a Git repo, it should be initialized with Git first, before initializing
-    DVC. This allows DVC to integrate with Git.
-    """
-    project_init(path, git=git, force=force, silent=True)
-
-
-@project_cli.command("assets")
-def project_assets_cli(
-    # fmt: off
-    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    # fmt: on
-):
-    """Use DVC (Data Version Control) to fetch project assets. Assets are
-    defined in the "assets" section of the project config. If possible, DVC
-    will try to track the files so you can pull changes from upstream. It will
-    also try and store the checksum so the assets are versioned. If the file
-    can't be tracked or checked, it will be downloaded without DVC. If a checksum
-    is provided in the project config, the file is only downloaded if no local
-    file with the same checksum exists.
-    """
-    project_assets(project_dir)
-
-
-@project_cli.command(
-    "run-all",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def project_run_all_cli(
-    # fmt: off
-    ctx: typer.Context,
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
-    # fmt: on
-):
-    """Run all commands defined in the project. This command will use DVC and
-    the defined outputs and dependencies in the project config to determine
-    which steps need to be re-run and where to start. This means you're only
-    re-generating data if the inputs have changed.
-
-    This command calls into "dvc repro" and all additional arguments are passed
-    to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
-    """
-    if show_help:
-        print_run_help(project_dir)
-    else:
-        project_run_all(project_dir, *ctx.args)
-
-
-@project_cli.command(
-    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def project_run_cli(
-    # fmt: off
-    ctx: typer.Context,
-    subcommand: str = Arg(None, help="Name of command defined in project config"),
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
-    # fmt: on
-):
-    """Run a named script defined in the project config. If the command is
-    part of the default pipeline defined in the "run" section, DVC is used to
-    determine whether the step should re-run if its inputs have changed, or
-    whether everything is up to date. If the script is not part of the default
-    pipeline, it will be called separately without DVC.
-
-    If DVC is used, the command calls into "dvc repro" and all additional
-    arguments are passed to the "dvc repro" command:
-    https://dvc.org/doc/command-reference/repro
-    """
-    if show_help or not subcommand:
-        print_run_help(project_dir, subcommand)
-    else:
-        project_run(project_dir, subcommand, *ctx.args)
-
-
-@project_cli.command("exec", hidden=True)
-def project_exec_cli(
-    # fmt: off
-    subcommand: str = Arg(..., help="Name of command defined in project config"),
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    # fmt: on
-):
-    """Execute a command defined in the project config. This CLI command is
-    only called internally in auto-generated DVC pipelines, as a shortcut for
-    multi-step commands in the project config. You typically shouldn't have to
-    call it yourself. To run a command, call "run" or "run-all".
-    """
-    project_exec(project_dir, subcommand)
-
-
-@project_cli.command("update-dvc")
-def project_update_dvc_cli(
-    # fmt: off
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
-    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
-    # fmt: on
-):
-    """Update the auto-generated DVC config file. Uses the steps defined in the
-    "run" section of the project config. This typically happens automatically
-    when running a command, but can also be triggered manually if needed.
-    """
-    config = load_project_config(project_dir)
-    updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
-    if updated:
-        msg.good(f"Updated DVC config from {CONFIG_FILE}")
-    else:
-        msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
-
-
-app.add_typer(project_cli, name="project")
-
-
-#################
-# CLI FUNCTIONS #
-#################
-
-
-def project_clone(
-    name: str,
-    dest: Path,
-    *,
-    repo: str = about.__projects__,
-    git: bool = False,
-    no_init: bool = False,
-) -> None:
-    """Clone a project template from a repository.
-
-    name (str): Name of subdirectory to clone.
-    dest (Path): Destination path of cloned project.
-    repo (str): URL of Git repo containing project templates.
-    git (bool): Initialize project as Git repo. Should be set to True if project
-        is intended as a repo, since it will allow DVC to integrate with Git.
-    no_init (bool): Don't initialize DVC and Git automatically. If True, the
-        "init" command or "git init" and "dvc init" need to be run manually.
-    """
-    dest = ensure_path(dest)
-    check_clone(name, dest, repo)
-    project_dir = dest.resolve()
-    # We're using Git and sparse checkout to only clone the files we need
-    with make_tempdir() as tmp_dir:
-        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
-        try:
-            run_command(cmd)
-        except SystemExit:
-            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
-            msg.fail(err)
-        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
-            f.write(name)
-        try:
-            run_command(["git", "-C", str(tmp_dir), "fetch"])
-            run_command(["git", "-C", str(tmp_dir), "checkout"])
-        except SystemExit:
-            err = f"Could not clone '{name}' in the repo '{repo}'."
-            msg.fail(err)
-        shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
-    msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
-    for sub_dir in DIRS:
-        dir_path = project_dir / sub_dir
-        if not dir_path.exists():
-            dir_path.mkdir(parents=True)
-    if not no_init:
-        project_init(project_dir, git=git, force=True, silent=True)
-    msg.good(f"Your project is now ready!", dest)
-    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
-
-
-def project_init(
-    project_dir: Path,
-    *,
-    git: bool = False,
-    force: bool = False,
-    silent: bool = False,
-    analytics: bool = False,
-):
-    """Initialize a project as a DVC and (optionally) as a Git repo.
-
-    project_dir (Path): Path to project directory.
-    git (bool): Also call "git init" to initialize directory as a Git repo.
-    silent (bool): Don't print any output (via DVC).
-    analytics (bool): Opt-in to DVC analytics (defaults to False).
-    """
-    with working_dir(project_dir) as cwd:
-        if git:
-            run_command(["git", "init"])
-        init_cmd = ["dvc", "init"]
-        if silent:
-            init_cmd.append("--quiet")
-        if not git:
-            init_cmd.append("--no-scm")
-        if force:
-            init_cmd.append("--force")
-        run_command(init_cmd)
-        # We don't want to have analytics on by default – our users should
-        # opt-in explicitly. If they want it, they can always enable it.
-        if not analytics:
-            run_command(["dvc", "config", "core.analytics", "false"])
-        # Remove unused and confusing plot templates from .dvc directory
-        # TODO: maybe we shouldn't do this, but it's otherwise super confusing
-        # once you commit your changes via Git and it creates a bunch of files
-        # that have no purpose
-        plots_dir = cwd / DVC_DIR / "plots"
-        if plots_dir.exists():
-            shutil.rmtree(str(plots_dir))
-        config = load_project_config(cwd)
-        setup_check_dvc(cwd, config)
-
-
-def project_assets(project_dir: Path) -> None:
-    """Fetch assets for a project using DVC if possible.
-
-    project_dir (Path): Path to project directory.
-    """
-    project_path = ensure_path(project_dir)
-    config = load_project_config(project_path)
-    setup_check_dvc(project_path, config)
-    assets = config.get("assets", {})
-    if not assets:
-        msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
-    msg.info(f"Fetching {len(assets)} asset(s)")
-    variables = config.get("variables", {})
-    fetched_assets = []
-    for asset in assets:
-        url = asset["url"].format(**variables)
-        dest = asset["dest"].format(**variables)
-        fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
-        if fetched_path:
-            fetched_assets.append(str(fetched_path))
-    if fetched_assets:
-        with working_dir(project_path):
-            run_command(["dvc", "add", *fetched_assets, "--external"])
-
-
-def fetch_asset(
-    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
-) -> Optional[Path]:
-    """Fetch an asset from a given URL or path. Will try to import the file
-    using DVC's import-url if possible (fully tracked and versioned) and falls
-    back to get-url (versioned) and a non-DVC download if necessary. If a
-    checksum is provided and a local file exists, it's only re-downloaded if the
-    checksum doesn't match.
-
-    project_path (Path): Path to project directory.
-    url (str): URL or path to asset.
-    checksum (Optional[str]): Optional expected checksum of local file.
-    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
-        the asset failed.
-    """
-    url = convert_asset_url(url)
-    dest_path = (project_path / dest).resolve()
-    if dest_path.exists() and checksum:
-        # If there's already a file, check for checksum
-        # TODO: add support for caches (dvc import-url with local path)
-        if checksum == get_checksum(dest_path):
-            msg.good(f"Skipping download with matching checksum: {dest}")
-            return dest_path
-    with working_dir(project_path):
-        try:
-            # If these fail, we don't want to output an error or info message.
-            # Try with tracking the source first, then just downloading with
-            # DVC, then a regular non-DVC download.
-            try:
-                dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
-                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
-            except subprocess.CalledProcessError:
-                dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
-                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
-        except subprocess.CalledProcessError:
-            try:
-                download_file(url, dest_path)
-            except requests.exceptions.HTTPError as e:
-                msg.fail(f"Download failed: {dest}", e)
-                return None
-    if checksum and checksum != get_checksum(dest_path):
-        msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
-    msg.good(f"Fetched asset {dest}")
-    return dest_path
-
-
-def project_run_all(project_dir: Path, *dvc_args) -> None:
-    """Run all commands defined in the project using DVC.
-
-    project_dir (Path): Path to project directory.
-    *dvc_args: Other arguments passed to "dvc repro".
-    """
-    config = load_project_config(project_dir)
-    setup_check_dvc(project_dir, config)
-    dvc_cmd = ["dvc", "repro", *dvc_args]
-    with working_dir(project_dir):
-        run_command(dvc_cmd)
-
-
-def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
-    """Simulate a CLI help prompt using the info available in the project config.
-
-    project_dir (Path): The project directory.
-    subcommand (Optional[str]): The subcommand or None. If a subcommand is
-        provided, the subcommand help is shown. Otherwise, the top-level help
-        and a list of available commands is printed.
-    """
-    config = load_project_config(project_dir)
-    setup_check_dvc(project_dir, config)
-    config_commands = config.get("commands", [])
-    commands = {cmd["name"]: cmd for cmd in config_commands}
-    if subcommand:
-        validate_subcommand(commands.keys(), subcommand)
-        print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
-        help_text = commands[subcommand].get("help")
-        if help_text:
-            msg.text(f"\n{help_text}\n")
-    else:
-        print(f"\nAvailable commands in {CONFIG_FILE}")
-        print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
-        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
-        msg.text("Run all commands defined in the 'run' block of the project config:")
-        print(f"{COMMAND} project run-all {project_dir}")
-
-
-def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
-    """Run a named script defined in the project config. If the script is part
-    of the default pipeline (defined in the "run" section), DVC is used to
-    execute the command, so it can determine whether to rerun it. It then
-    calls into "exec" to execute it.
-
-    project_dir (Path): Path to project directory.
-    subcommand (str): Name of command to run.
-    *dvc_args: Other arguments passed to "dvc repro".
-    """
-    config = load_project_config(project_dir)
-    setup_check_dvc(project_dir, config)
-    config_commands = config.get("commands", [])
-    variables = config.get("variables", {})
-    commands = {cmd["name"]: cmd for cmd in config_commands}
-    validate_subcommand(commands.keys(), subcommand)
-    if subcommand in config.get("run", []):
-        # This is one of the pipeline commands tracked in DVC
-        dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
-        with working_dir(project_dir):
-            run_command(dvc_cmd)
-    else:
-        cmd = commands[subcommand]
-        # Deps in non-DVC commands aren't tracked, but if they're defined,
-        # make sure they exist before running the command
-        for dep in cmd.get("deps", []):
-            if not (project_dir / dep).exists():
-                err = f"Missing dependency specified by command '{subcommand}': {dep}"
-                msg.fail(err, exits=1)
-        with working_dir(project_dir):
-            run_commands(cmd["script"], variables)
-
-
-def project_exec(project_dir: Path, subcommand: str):
-    """Execute a command defined in the project config.
-
-    project_dir (Path): Path to project directory.
-    subcommand (str): Name of command to run.
-    """
-    config = load_project_config(project_dir)
-    config_commands = config.get("commands", [])
-    variables = config.get("variables", {})
-    commands = {cmd["name"]: cmd for cmd in config_commands}
-    with working_dir(project_dir):
-        run_commands(commands[subcommand]["script"], variables)
-
-
-###########
-# HELPERS #
-###########
-
-
-def load_project_config(path: Path) -> Dict[str, Any]:
-    """Load the project config file from a directory and validate it.
-
-    path (Path): The path to the project directory.
-    RETURNS (Dict[str, Any]): The loaded project config.
-    """
-    config_path = path / CONFIG_FILE
-    if not config_path.exists():
-        msg.fail("Can't find project config", config_path, exits=1)
-    invalid_err = f"Invalid project config in {CONFIG_FILE}"
-    try:
-        config = srsly.read_yaml(config_path)
-    except ValueError as e:
-        msg.fail(invalid_err, e, exits=1)
-    errors = validate(ProjectConfigSchema, config)
-    if errors:
-        msg.fail(invalid_err, "\n".join(errors), exits=1)
-    return config
-
-
-def update_dvc_config(
-    path: Path,
-    config: Dict[str, Any],
-    verbose: bool = False,
-    silent: bool = False,
-    force: bool = False,
-) -> bool:
-    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
-    project directory. The file is auto-generated based on the config. The
-    first line of the auto-generated file specifies the hash of the config
-    dict, so if any of the config values change, the DVC config is regenerated.
-
-    path (Path): The path to the project directory.
-    config (Dict[str, Any]): The loaded project config.
-    verbose (bool): Whether to print additional info (via DVC).
-    silent (bool): Don't output anything (via DVC).
-    force (bool): Force update, even if hashes match.
-    RETURNS (bool): Whether the DVC config file was updated.
-    """
-    config_hash = get_hash(config)
-    path = path.resolve()
-    dvc_config_path = path / DVC_CONFIG
-    if dvc_config_path.exists():
-        # Check if the file was generated using the current config, if not, redo
-        with dvc_config_path.open("r", encoding="utf8") as f:
-            ref_hash = f.readline().strip().replace("# ", "")
-        if ref_hash == config_hash and not force:
-            return False  # Nothing has changed in project config, don't need to update
-        dvc_config_path.unlink()
-    variables = config.get("variables", {})
-    commands = []
-    # We only want to include commands that are part of the main list of "run"
-    # commands in project.yml and should be run in sequence
-    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
-    for name in config.get("run", []):
-        validate_subcommand(config_commands.keys(), name)
-        command = config_commands[name]
-        deps = command.get("deps", [])
-        outputs = command.get("outputs", [])
-        outputs_no_cache = command.get("outputs_no_cache", [])
-        if not deps and not outputs and not outputs_no_cache:
-            continue
-        # Default to the working dir as the project path since dvc.yaml is auto-generated
-        # and we don't want arbitrary paths in there
-        project_cmd = ["python", "-m", NAME, "project", "exec", name]
-        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
-        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
-        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-        dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
-        if verbose:
-            dvc_cmd.append("--verbose")
-        if silent:
-            dvc_cmd.append("--quiet")
-        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
-        commands.append(" ".join(full_cmd))
-    with working_dir(path):
-        run_commands(commands, variables, silent=True)
-    with dvc_config_path.open("r+", encoding="utf8") as f:
-        content = f.read()
-        f.seek(0, 0)
-        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
-    return True
-
-
-def ensure_dvc() -> None:
-    """Ensure that the "dvc" command is available and show an error if not."""
-    try:
-        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
-    except Exception:
-        msg.fail(
-            "spaCy projects require DVC (Data Version Control) and the 'dvc' command",
-            "You can install the Python package from pip (pip install dvc) or "
-            "conda (conda install -c conda-forge dvc). For more details, see the "
-            "documentation: https://dvc.org/doc/install",
-            exits=1,
-        )
-
-
-def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
-    """Check that the project is set up correctly with DVC and update its
-    config if needed. Will raise an error if the project is not an initialized
-    DVC project.
-
-    project_dir (Path): The path to the project directory.
-    config (Dict[str, Any]): The loaded project config.
-    """
-    if not project_dir.exists():
-        msg.fail(f"Can't find project directory: {project_dir}")
-    if not (project_dir / ".dvc").exists():
-        msg.fail(
-            "Project not initialized as a DVC project.",
-            f"Make sure that the project template was cloned correctly. To "
-            f"initialize the project directory manually, you can run: "
-            f"{COMMAND} project init {project_dir}",
-            exits=1,
-        )
-    with msg.loading("Updating DVC config..."):
-        updated = update_dvc_config(project_dir, config, silent=True)
-    if updated:
-        msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
-
-
-def run_commands(
-    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
-) -> None:
-    """Run a sequence of commands in a subprocess, in order.
-
-    commands (List[str]): The string commands.
-    variables (Dict[str, str]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
-    silent (bool): Don't print the commands.
-    """
-    for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
-        command = split_command(command)
-        # Not sure if this is needed or a good idea. Motivation: users may often
-        # use commands in their config that reference "python" and we want to
-        # make sure that it's always executing the same Python that spaCy is
-        # executed with and the pip in the same env, not some other Python/pip.
-        # Also ensures cross-compatibility if user 1 writes "python3" (because
-        # that's how it's set up on their system), and user 2 without the
-        # shortcut tries to re-run the command.
-        if len(command) and command[0] in ("python", "python3"):
-            command[0] = sys.executable
-        elif len(command) and command[0] in ("pip", "pip3"):
-            command = [sys.executable, "-m", "pip", *command[1:]]
-        if not silent:
-            print(f"Running command: {' '.join(command)}")
-        run_command(command)
-
-
-def convert_asset_url(url: str) -> str:
-    """Check and convert the asset URL if needed.
-
-    url (str): The asset URL.
-    RETURNS (str): The converted URL.
-    """
-    # If the asset URL is a regular GitHub URL it's likely a mistake
-    if re.match("(http(s?)):\/\/github.com", url):
-        converted = url.replace("github.com", "raw.githubusercontent.com")
-        converted = re.sub(r"/(tree|blob)/", "/", converted)
-        msg.warn(
-            "Downloading from a regular GitHub URL. This will only download "
-            "the source of the page, not the actual file. Converting the URL "
-            "to a raw URL.",
-            converted,
-        )
-        return converted
-    return url
-
-
-def check_clone(name: str, dest: Path, repo: str) -> None:
-    """Check and validate that the destination path can be used to clone. Will
-    check that Git is available and that the destination path is suitable.
-
-    name (str): Name of the directory to clone from the repo.
-    dest (Path): Local destination of cloned directory.
-    repo (str): URL of the repo to clone from.
-    """
-    try:
-        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
-    except Exception:
-        msg.fail(
-            f"Cloning spaCy project templates requires Git and the 'git' command. ",
-            f"To clone a project without Git, copy the files from the '{name}' "
-            f"directory in the {repo} to {dest} manually and then run:",
-            f"{COMMAND} project init {dest}",
-            exits=1,
-        )
-    if not dest:
-        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
-    if dest.exists():
-        # Directory already exists (not allowed, clone needs to create it)
-        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
-    if not dest.parent.exists():
-        # We're not creating parents, parent dir should exist
-        msg.fail(
-            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
-            exits=1,
-        )
-
-
-def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
-    """Check that a subcommand is valid and defined. Raises an error otherwise.
-
-    commands (Sequence[str]): The available commands.
-    subcommand (str): The subcommand.
-    """
-    if subcommand not in commands:
-        msg.fail(
-            f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
-            f"Available commands: {', '.join(commands)}",
-            exits=1,
-        )
-
-
-def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
-    """Download a file using requests.
-
-    url (str): The URL of the file.
-    dest (Path): The destination path.
-    chunk_size (int): The size of chunks to read/write.
-    """
-    response = requests.get(url, stream=True)
-    response.raise_for_status()
-    total = int(response.headers.get("content-length", 0))
-    progress_settings = {
-        "total": total,
-        "unit": "iB",
-        "unit_scale": True,
-        "unit_divisor": chunk_size,
-        "leave": False,
-    }
-    with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
-        for data in response.iter_content(chunk_size=chunk_size):
-            size = f.write(data)
-            bar.update(size)
diff --git a/spacy/cli/project/__init__.py b/spacy/cli/project/__init__.py
new file mode 100644
index 000000000..e69de29bb
+ """ + project_path = ensure_path(project_dir) + config = load_project_config(project_path) + assets = config.get("assets", {}) + if not assets: + msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) + msg.info(f"Fetching {len(assets)} asset(s)") + variables = config.get("variables", {}) + for asset in assets: + dest = asset["dest"].format(**variables) + url = asset.get("url") + checksum = asset.get("checksum") + if not url: + # project.yml defines asset without URL that the user has to place + check_private_asset(dest, checksum) + continue + url = url.format(**variables) + fetch_asset(project_path, url, dest, checksum) + + +def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None: + """Check and validate assets without a URL (private assets that the user + has to provide themselves) and give feedback about the checksum. + + dest (Path): Desintation path of the asset. + checksum (Optional[str]): Optional checksum of the expected file. + """ + if not Path(dest).exists(): + err = f"No URL provided for asset. You need to add this file yourself: {dest}" + msg.warn(err) + else: + if checksum and checksum == get_checksum(dest): + msg.good(f"Asset exists with matching checksum: {dest}") + else: + msg.fail(f"Asset available but with incorrect checksum: {dest}") + + +def fetch_asset( + project_path: Path, url: str, dest: Path, checksum: Optional[str] = None +) -> None: + """Fetch an asset from a given URL or path. If a checksum is provided and a + local file exists, it's only re-downloaded if the checksum doesn't match. + + project_path (Path): Path to project directory. + url (str): URL or path to asset. + checksum (Optional[str]): Optional expected checksum of local file. + RETURNS (Optional[Path]): The path to the fetched asset or None if fetching + the asset failed. + """ + # TODO: add support for caches + dest_path = (project_path / dest).resolve() + if dest_path.exists() and checksum: + # If there's already a file, check for checksum + if checksum == get_checksum(dest_path): + msg.good(f"Skipping download with matching checksum: {dest}") + return dest_path + # We might as well support the user here and create parent directories in + # case the asset dir isn't listed as a dir to create in the project.yml + if not dest_path.parent.exists(): + dest_path.parent.mkdir(parents=True) + with working_dir(project_path): + url = convert_asset_url(url) + try: + download_file(url, dest_path) + msg.good(f"Downloaded asset {dest}") + except requests.exceptions.RequestException as e: + if Path(url).exists() and Path(url).is_file(): + # If it's a local file, copy to destination + shutil.copy(url, str(dest_path)) + msg.good(f"Copied local asset {dest}") + else: + msg.fail(f"Download failed: {dest}", e) + return + if checksum and checksum != get_checksum(dest_path): + msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") + + +def convert_asset_url(url: str) -> str: + """Check and convert the asset URL if needed. + + url (str): The asset URL. + RETURNS (str): The converted URL. + """ + # If the asset URL is a regular GitHub URL it's likely a mistake + if re.match(r"(http(s?)):\/\/github.com", url): + converted = url.replace("github.com", "raw.githubusercontent.com") + converted = re.sub(r"/(tree|blob)/", "/", converted) + msg.warn( + "Downloading from a regular GitHub URL. This will only download " + "the source of the page, not the actual file. 
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
new file mode 100644
index 000000000..6ce2d309e
--- /dev/null
+++ b/spacy/cli/project/clone.py
@@ -0,0 +1,97 @@
+from typing import Optional
+from pathlib import Path
+from wasabi import msg
+import subprocess
+import shutil
+import re
+
+from ... import about
+from ...util import ensure_path, run_command, make_tempdir
+from .._app import project_cli, Arg, Opt, COMMAND
+from .util import PROJECT_FILE
+
+
+@project_cli.command("clone")
+def project_clone_cli(
+    # fmt: off
+    name: str = Arg(..., help="The name of the template to clone"),
+    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
+    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
+    # fmt: on
+):
+    """Clone a project template from a repository. Calls into "git" and will
+    only download the files from the given subdirectory. The GitHub repo
+    defaults to the official spaCy template repo, but can be customized
+    (including using a private repo).
+    """
+    if dest is None:
+        dest = Path.cwd() / name
+    project_clone(name, dest, repo=repo)
+
+
+def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
+    """Clone a project template from a repository.
+
+    name (str): Name of subdirectory to clone.
+    dest (Path): Destination path of cloned project.
+    repo (str): URL of Git repo containing project templates.
+    """
+    dest = ensure_path(dest)
+    check_clone(name, dest, repo)
+    project_dir = dest.resolve()
+    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
+    # We're using Git and sparse checkout to only clone the files we need
+    with make_tempdir() as tmp_dir:
+        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
+        try:
+            run_command(cmd)
+        except subprocess.CalledProcessError:
+            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
+            msg.fail(err)
+        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
+            f.write(name)
+        try:
+            run_command(["git", "-C", str(tmp_dir), "fetch"])
+            run_command(["git", "-C", str(tmp_dir), "checkout"])
+        except subprocess.CalledProcessError:
+            err = f"Could not clone '{name}' from repo '{repo_name}'"
+            msg.fail(err)
+        # We need Path(name) to make sure we also support subdirectories
+        shutil.move(str(tmp_dir / Path(name)), str(project_dir))
+    msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
+    if not (project_dir / PROJECT_FILE).exists():
+        msg.warn(f"No {PROJECT_FILE} found in directory")
+    else:
+        msg.good(f"Your project is now ready!")
+        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
+
+
+def check_clone(name: str, dest: Path, repo: str) -> None:
+    """Check and validate that the destination path can be used to clone. Will
+    check that Git is available and that the destination path is suitable.
+
+    name (str): Name of the directory to clone from the repo.
+    dest (Path): Local destination of cloned directory.
+    repo (str): URL of the repo to clone from.
+    """
+    try:
+        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        msg.fail(
+            f"Cloning spaCy project templates requires Git and the 'git' command. ",
+            f"To clone a project without Git, copy the files from the '{name}' "
+            f"directory in the {repo} to {dest} manually and then run:",
+            f"{COMMAND} project init {dest}",
+            exits=1,
+        )
+    if not dest:
+        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
+    if dest.exists():
+        # Directory already exists (not allowed, clone needs to create it)
+        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
+    if not dest.parent.exists():
+        # We're not creating parents, parent dir should exist
+        msg.fail(
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
+            exits=1,
+        )
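Cloning now only requires Git, not DVC; the sparse checkout fetches just the template's
subdirectory from the templates repo. Usage might look like this — the template name is
illustrative:

    python -m spacy project clone some_example_template
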
+ msg.fail(err) + with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: + f.write(name) + try: + run_command(["git", "-C", str(tmp_dir), "fetch"]) + run_command(["git", "-C", str(tmp_dir), "checkout"]) + except subprocess.CalledProcessError: + err = f"Could not clone '{name}' from repo '{repo_name}'" + msg.fail(err) + # We need Path(name) to make sure we also support subdirectories + shutil.move(str(tmp_dir / Path(name)), str(project_dir)) + msg.good(f"Cloned '{name}' from {repo_name}", project_dir) + if not (project_dir / PROJECT_FILE).exists(): + msg.warn(f"No {PROJECT_FILE} found in directory") + else: + msg.good(f"Your project is now ready!") + print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") + + +def check_clone(name: str, dest: Path, repo: str) -> None: + """Check and validate that the destination path can be used to clone. Will + check that Git is available and that the destination path is suitable. + + name (str): Name of the directory to clone from the repo. + dest (Path): Local destination of cloned directory. + repo (str): URL of the repo to clone from. + """ + try: + subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL) + except Exception: + msg.fail( + f"Cloning spaCy project templates requires Git and the 'git' command. ", + f"To clone a project without Git, copy the files from the '{name}' " + f"directory in the {repo} to {dest} manually and then run:", + f"{COMMAND} project init {dest}", + exits=1, + ) + if not dest: + msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) + if dest.exists(): + # Directory already exists (not allowed, clone needs to create it) + msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) + if not dest.parent.exists(): + # We're not creating parents, parent dir should exist + msg.fail( + f"Can't clone project, parent directory doesn't exist: {dest.parent}", + exits=1, + ) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py new file mode 100644 index 000000000..c29618820 --- /dev/null +++ b/spacy/cli/project/dvc.py @@ -0,0 +1,208 @@ +"""This module contains helpers and subcommands for integrating spaCy projects +with Data Version Controk (DVC). https://dvc.org""" +from typing import Dict, Any, List, Optional +import subprocess +from pathlib import Path +from wasabi import msg + +from .util import PROJECT_FILE, load_project_config, get_hash +from .._app import project_cli, Arg, Opt, NAME, COMMAND +from ...util import working_dir, split_command, join_command, run_command + + +DVC_CONFIG = "dvc.yaml" +DVC_DIR = ".dvc" +UPDATE_COMMAND = "dvc" +DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've +# edited your {PROJECT_FILE}, you can regenerate this file by running: +# {COMMAND} project {UPDATE_COMMAND}""" + + +@project_cli.command(UPDATE_COMMAND) +def project_update_dvc_cli( + # fmt: off + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), + workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."), + verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), + force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), + # fmt: on +): + """Auto-generate Data Version Control (DVC) config. A DVC + project can only define one pipeline, so you need to specify one workflow + defined in the project.yml. 
If no workflow is specified, the first defined + workflow is used. The DVC config will only be updated if the project.yml changed. + """ + project_update_dvc(project_dir, workflow, verbose=verbose, force=force) + + +def project_update_dvc( + project_dir: Path, + workflow: Optional[str] = None, + *, + verbose: bool = False, + force: bool = False, +) -> None: + """Update the auto-generated Data Version Control (DVC) config file. A DVC + project can only define one pipeline, so you need to specify one workflow + defined in the project.yml. Will only update the file if the checksum changed. + + project_dir (Path): The project directory. + workflow (Optional[str]): Optional name of workflow defined in project.yml. + If not set, the first workflow will be used. + verbose (bool): Print more info. + force (bool): Force update DVC config. + """ + config = load_project_config(project_dir) + updated = update_dvc_config( + project_dir, config, workflow, verbose=verbose, force=force + ) + help_msg = "To execute the workflow with DVC, run: dvc repro" + if updated: + msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg) + else: + msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg) + + +def update_dvc_config( + path: Path, + config: Dict[str, Any], + workflow: Optional[str] = None, + verbose: bool = False, + silent: bool = False, + force: bool = False, +) -> bool: + """Re-run the DVC commands in dry mode and update dvc.yaml file in the + project directory. The file is auto-generated based on the config. The + first line of the auto-generated file specifies the hash of the config + dict, so if any of the config values change, the DVC config is regenerated. + + path (Path): The path to the project directory. + config (Dict[str, Any]): The loaded project.yml. + verbose (bool): Whether to print additional info (via DVC). + silent (bool): Don't output anything (via DVC). + force (bool): Force update, even if hashes match. + RETURNS (bool): Whether the DVC config file was updated. 
+ """ + ensure_dvc(path) + workflows = config.get("workflows", {}) + workflow_names = list(workflows.keys()) + check_workflows(workflow_names, workflow) + if not workflow: + workflow = workflow_names[0] + config_hash = get_hash(config) + path = path.resolve() + dvc_config_path = path / DVC_CONFIG + if dvc_config_path.exists(): + # Check if the file was generated using the current config, if not, redo + with dvc_config_path.open("r", encoding="utf8") as f: + ref_hash = f.readline().strip().replace("# ", "") + if ref_hash == config_hash and not force: + return False # Nothing has changed in project.yml, don't need to update + dvc_config_path.unlink() + variables = config.get("variables", {}) + dvc_commands = [] + config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + for name in workflows[workflow]: + command = config_commands[name] + deps = command.get("deps", []) + outputs = command.get("outputs", []) + outputs_no_cache = command.get("outputs_no_cache", []) + if not deps and not outputs and not outputs_no_cache: + continue + # Default to the working dir as the project path since dvc.yaml is auto-generated + # and we don't want arbitrary paths in there + project_cmd = ["python", "-m", NAME, "project", "run", name] + deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] + outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] + outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] + dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"] + if command.get("no_skip"): + dvc_cmd.append("--always-changed") + full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] + dvc_commands.append(join_command(full_cmd)) + with working_dir(path): + dvc_flags = {"--verbose": verbose, "--quiet": silent} + run_dvc_commands(dvc_commands, variables, flags=dvc_flags) + with dvc_config_path.open("r+", encoding="utf8") as f: + content = f.read() + f.seek(0, 0) + f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") + return True + + +def run_dvc_commands( + commands: List[str] = tuple(), + variables: Dict[str, str] = {}, + flags: Dict[str, bool] = {}, +) -> None: + """Run a sequence of DVC commands in a subprocess, in order. + + commands (List[str]): The string commands without the leading "dvc". + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + flags (Dict[str, bool]): Conditional flags to be added to command. Makes it + easier to pass flags like --quiet that depend on a variable or + command-line setting while avoiding lots of nested conditionals. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + dvc_command = ["dvc", *command] + # Add the flags if they are set to True + for flag, is_active in flags.items(): + if is_active: + dvc_command.append(flag) + run_command(dvc_command) + + +def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None: + """Validate workflows provided in project.yml and check that a given + workflow can be used to generate a DVC config. + + workflows (List[str]): Names of the available workflows. + workflow (Optional[str]): The name of the workflow to convert. + """ + if not workflows: + msg.fail( + f"No workflows defined in {PROJECT_FILE}. 
To generate a DVC config, "
+            f"define at least one list of commands.",
+            exits=1,
+        )
+    if workflow is not None and workflow not in workflows:
+        msg.fail(
+            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
+            f"Available workflows: {', '.join(workflows)}",
+            exits=1,
+        )
+    if not workflow:
+        msg.warn(
+            f"No workflow specified for DVC pipeline. Using the first workflow "
+            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
+        )
+
+
+def ensure_dvc(project_dir: Path) -> None:
+    """Ensure that the "dvc" command is available and that the current project
+    directory is an initialized DVC project.
+    """
+    try:
+        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        msg.fail(
+            "To use spaCy projects with DVC (Data Version Control), DVC needs "
+            "to be installed and the 'dvc' command needs to be available",
+            "You can install the Python package from pip (pip install dvc) or "
+            "conda (conda install -c conda-forge dvc). For more details, see the "
+            "documentation: https://dvc.org/doc/install",
+            exits=1,
+        )
+    if not (project_dir / ".dvc").exists():
+        msg.fail(
+            "Project not initialized as a DVC project",
+            "To initialize a DVC project, you can run 'dvc init' in the project "
+            "directory. For more details, see the documentation: "
+            "https://dvc.org/doc/command-reference/init",
+            exits=1,
+        )
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
new file mode 100644
index 000000000..a8cc58c01
--- /dev/null
+++ b/spacy/cli/project/run.py
@@ -0,0 +1,266 @@
+from typing import Optional, List, Dict, Sequence, Any
+from pathlib import Path
+from wasabi import msg
+import sys
+import srsly
+
+from ...util import working_dir, run_command, split_command, is_cwd, join_command
+from .._app import project_cli, Arg, Opt, COMMAND
+from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
+from .util import get_checksum
+
+
+@project_cli.command("run")
+def project_run_cli(
+    # fmt: off
+    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
+    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
+    # fmt: on
+):
+    """Run a named command or workflow defined in the project.yml. If a workflow
+    name is specified, all commands in the workflow are run, in order. If
+    commands define dependencies and/or outputs, they will only be re-run if
+    state has changed.
+    """
+    if show_help or not subcommand:
+        print_run_help(project_dir, subcommand)
+    else:
+        project_run(project_dir, subcommand, force=force, dry=dry)
+
+
+def project_run(
+    project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
+) -> None:
+    """Run a named command or workflow defined in the project.yml. If a
+    workflow name is specified, all commands in the workflow are run, in
+    order. If a command defines dependencies and/or outputs, it will only be
+    re-run if the state recorded in the lockfile has changed.
+
+    project_dir (Path): Path to project directory.
+    subcommand (str): Name of command to run.
+    force (bool): Force re-running, even if nothing changed.
+    dry (bool): Perform a dry run and don't execute commands.
+ """ + config = load_project_config(project_dir) + variables = config.get("variables", {}) + commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + workflows = config.get("workflows", {}) + validate_subcommand(commands.keys(), workflows.keys(), subcommand) + if subcommand in workflows: + msg.info(f"Running workflow '{subcommand}'") + for cmd in workflows[subcommand]: + project_run(project_dir, cmd, force=force, dry=dry) + else: + cmd = commands[subcommand] + variables = config.get("variables", {}) + for dep in cmd.get("deps", []): + dep = dep.format(**variables) + if not (project_dir / dep).exists(): + err = f"Missing dependency specified by command '{subcommand}': {dep}" + err_kwargs = {"exits": 1} if not dry else {} + msg.fail(err, **err_kwargs) + with working_dir(project_dir) as current_dir: + rerun = check_rerun(current_dir, cmd, variables) + if not rerun and not force: + msg.info(f"Skipping '{cmd['name']}': nothing changed") + else: + msg.divider(subcommand) + run_commands(cmd["script"], variables, dry=dry) + if not dry: + update_lockfile(current_dir, cmd, variables) + + +def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: + """Simulate a CLI help prompt using the info available in the project.yml. + + project_dir (Path): The project directory. + subcommand (Optional[str]): The subcommand or None. If a subcommand is + provided, the subcommand help is shown. Otherwise, the top-level help + and a list of available commands is printed. + """ + config = load_project_config(project_dir) + config_commands = config.get("commands", []) + commands = {cmd["name"]: cmd for cmd in config_commands} + workflows = config.get("workflows", {}) + project_loc = "" if is_cwd(project_dir) else project_dir + if subcommand: + validate_subcommand(commands.keys(), workflows.keys(), subcommand) + print(f"Usage: {COMMAND} project run {subcommand} {project_loc}") + if subcommand in commands: + help_text = commands[subcommand].get("help") + if help_text: + print(f"\n{help_text}\n") + elif subcommand in workflows: + steps = workflows[subcommand] + print(f"\nWorkflow consisting of {len(steps)} commands:") + steps_data = [ + (f"{i + 1}. {step}", commands[step].get("help", "")) + for i, step in enumerate(steps) + ] + msg.table(steps_data) + help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help" + print(f"For command details, run: {help_cmd}") + else: + print("") + if config_commands: + print(f"Available commands in {PROJECT_FILE}") + print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") + msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) + if workflows: + print(f"Available workflows in {PROJECT_FILE}") + print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}") + msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()]) + + +def run_commands( + commands: List[str] = tuple(), + variables: Dict[str, Any] = {}, + silent: bool = False, + dry: bool = False, +) -> None: + """Run a sequence of commands in a subprocess, in order. + + commands (List[str]): The string commands. + variables (Dict[str, Any]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + silent (bool): Don't print the commands. + dry (bool): Perform a dry run and don't execut anything. + """ + for command in commands: + # Substitute variables, e.g. 
"./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + # Not sure if this is needed or a good idea. Motivation: users may often + # use commands in their config that reference "python" and we want to + # make sure that it's always executing the same Python that spaCy is + # executed with and the pip in the same env, not some other Python/pip. + # Also ensures cross-compatibility if user 1 writes "python3" (because + # that's how it's set up on their system), and user 2 without the + # shortcut tries to re-run the command. + if len(command) and command[0] in ("python", "python3"): + command[0] = sys.executable + elif len(command) and command[0] in ("pip", "pip3"): + command = [sys.executable, "-m", "pip", *command[1:]] + if not silent: + print(f"Running command: {join_command(command)}") + if not dry: + run_command(command) + + +def validate_subcommand( + commands: Sequence[str], workflows: Sequence[str], subcommand: str +) -> None: + """Check that a subcommand is valid and defined. Raises an error otherwise. + + commands (Sequence[str]): The available commands. + subcommand (str): The subcommand. + """ + if not commands and not workflows: + msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1) + if subcommand not in commands and subcommand not in workflows: + help_msg = [] + if commands: + help_msg.append(f"Available commands: {', '.join(commands)}") + if workflows: + help_msg.append(f"Available workflows: {', '.join(workflows)}") + msg.fail( + f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}", + ". ".join(help_msg), + exits=1, + ) + + +def check_rerun( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> bool: + """Check if a command should be rerun because its settings or inputs/outputs + changed. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. + RETURNS (bool): Whether to re-run the command. + """ + lock_path = project_dir / PROJECT_LOCK + if not lock_path.exists(): # We don't have a lockfile, run command + return True + data = srsly.read_yaml(lock_path) + if command["name"] not in data: # We don't have info about this command + return True + entry = data[command["name"]] + # Always run commands with no outputs (otherwise they'd always be skipped) + if not entry.get("outs", []): + return True + # If the entry in the lockfile matches the lockfile entry that would be + # generated from the current command, we don't rerun because it means that + # all inputs/outputs, hashes and scripts are the same and nothing changed + return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry) + + +def update_lockfile( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> None: + """Update the lockfile after running a command. Will create a lockfile if + it doesn't yet exist and will add an entry for the current command, its + script and dependencies/outputs. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. 
+ """ + lock_path = project_dir / PROJECT_LOCK + if not lock_path.exists(): + srsly.write_yaml(lock_path, {}) + data = {} + else: + data = srsly.read_yaml(lock_path) + data[command["name"]] = get_lock_entry(project_dir, command, variables) + srsly.write_yaml(lock_path, data) + + +def get_lock_entry( + project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] +) -> Dict[str, Any]: + """Get a lockfile entry for a given command. An entry includes the command, + the script (command steps) and a list of dependencies and outputs with + their paths and file hashes, if available. The format is based on the + dvc.lock files, to keep things consistent. + + project_dir (Path): The current project directory. + command (Dict[str, Any]): The command, as defined in the project.yml. + variables (Dict[str, Any]): The variables defined in the project.yml. + RETURNS (Dict[str, Any]): The lockfile entry. + """ + deps = get_fileinfo(project_dir, command.get("deps", []), variables) + outs = get_fileinfo(project_dir, command.get("outputs", []), variables) + outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables) + return { + "cmd": f"{COMMAND} run {command['name']}", + "script": command["script"], + "deps": deps, + "outs": [*outs, *outs_nc], + } + + +def get_fileinfo( + project_dir: Path, paths: List[str], variables: Dict[str, Any] +) -> List[Dict[str, str]]: + """Generate the file information for a list of paths (dependencies, outputs). + Includes the file path and the file's checksum. + + project_dir (Path): The current project directory. + paths (List[str]): The file paths. + variables (Dict[str, Any]): The variables defined in the project.yml. + RETURNS (List[Dict[str, str]]): The lockfile entry for a file. + """ + data = [] + for path in paths: + path = path.format(**variables) + file_path = project_dir / path + md5 = get_checksum(file_path) if file_path.exists() else None + data.append({"path": path, "md5": md5}) + return data diff --git a/spacy/cli/project/util.py b/spacy/cli/project/util.py new file mode 100644 index 000000000..1111ddc2d --- /dev/null +++ b/spacy/cli/project/util.py @@ -0,0 +1,93 @@ +from typing import Dict, Any, Union +from pathlib import Path +from wasabi import msg +import srsly +import hashlib + +from ...schemas import ProjectConfigSchema, validate + + +PROJECT_FILE = "project.yml" +PROJECT_LOCK = "project.lock" + + +def load_project_config(path: Path) -> Dict[str, Any]: + """Load the project.yml file from a directory and validate it. Also make + sure that all directories defined in the config exist. + + path (Path): The path to the project directory. + RETURNS (Dict[str, Any]): The loaded project.yml. + """ + config_path = path / PROJECT_FILE + if not config_path.exists(): + msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) + invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." 
+ try: + config = srsly.read_yaml(config_path) + except ValueError as e: + msg.fail(invalid_err, e, exits=1) + errors = validate(ProjectConfigSchema, config) + if errors: + msg.fail(invalid_err, "\n".join(errors), exits=1) + validate_project_commands(config) + # Make sure directories defined in config exist + for subdir in config.get("directories", []): + dir_path = path / subdir + if not dir_path.exists(): + dir_path.mkdir(parents=True) + return config + + +def validate_project_commands(config: Dict[str, Any]) -> None: + """Check that project commands and workflows are valid, don't contain + duplicates, don't clash and only refer to commands that exist. + + config (Dict[str, Any]): The loaded config. + """ + command_names = [cmd["name"] for cmd in config.get("commands", [])] + workflows = config.get("workflows", {}) + duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1]) + if duplicates: + err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}" + msg.fail(err, exits=1) + for workflow_name, workflow_steps in workflows.items(): + if workflow_name in command_names: + err = f"Can't use workflow name '{workflow_name}': name already exists as a command" + msg.fail(err, exits=1) + for step in workflow_steps: + if step not in command_names: + msg.fail( + f"Unknown command specified in workflow '{workflow_name}': {step}", + f"Workflows can only refer to commands defined in the 'commands' " + f"section of the {PROJECT_FILE}.", + exits=1, + ) + + +def get_hash(data) -> str: + """Get the hash for a JSON-serializable object. + + data: The data to hash. + RETURNS (str): The hash. + """ + data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") + return hashlib.md5(data_str).hexdigest() + + +def get_checksum(path: Union[Path, str]) -> str: + """Get the checksum for a file or directory given its file path. If a + directory path is provided, this uses all files in that directory. + + path (Union[Path, str]): The file or directory path. + RETURNS (str): The checksum. + """ + path = Path(path) + if path.is_file(): + return hashlib.md5(Path(path).read_bytes()).hexdigest() + if path.is_dir(): + # TODO: this is currently pretty slow + dir_checksum = hashlib.md5() + for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): + dir_checksum.update(sub_file.read_bytes()) + return dir_checksum.hexdigest() + raise ValueError(f"Can't get checksum for {path}: not a file or directory") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index f09592f77..d65b2de12 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -121,14 +121,14 @@ class ConfigSchema(BaseModel): @app.command("train") def train_cli( # fmt: off - train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True), - dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True), + train_path: Path = Arg(..., help="Location of training data", exists=True), + dev_path: Path = Arg(..., help="Location of development data", exists=True), config_path: Path = Arg(..., help="Path to config file", exists=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental."), raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."), - verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), num_workers: int = Opt(None, "-j", help="Parallel Workers"), strategy: str = Opt("allreduce", "--strategy", help="Distributed training strategy (requires spacy_ray)"), @@ -155,6 +155,7 @@ def train_cli( if init_tok2vec is not None: with init_tok2vec.open("rb") as file_: weights_data = file_.read() + train_args = dict( config_path=config_path, data_paths={"train": train_path, "dev": dev_path}, @@ -170,7 +171,7 @@ def train_cli( distributed_setup_and_train(use_gpu, num_workers, strategy, ray_address, train_args) else: if use_gpu >= 0: - msg.info(f"Using GPU: {str(use_gpu)}") + msg.info(f"Using GPU: {use_gpu}") require_gpu(use_gpu) else: msg.info("Using CPU") @@ -191,7 +192,8 @@ def train( msg.info(f"Loading config from: {config_path}") # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) - fix_random_seed(config["training"]["seed"]) + if config["training"].get("seed"): + fix_random_seed(config["training"]["seed"]) if config["training"].get("use_pytorch_for_gpu_memory"): # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() @@ -216,7 +218,10 @@ def train( msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") train_examples = list( corpus.train_dataset( - nlp, shuffle=False, gold_preproc=training["gold_preproc"] + nlp, + shuffle=False, + gold_preproc=training["gold_preproc"], + max_length=training["max_length"], ) ) nlp.begin_training(lambda: train_examples) @@ -315,6 +320,7 @@ def create_train_batches(nlp, corpus, cfg, randomization_index): ) epoch = 0 + batch_strategy = cfg.get("batch_by", "sequences") while True: if len(train_examples) == 0: raise ValueError(Errors.E988) @@ -324,11 +330,22 @@ def create_train_batches(nlp, corpus, cfg, randomization_index): random.random() random.shuffle(train_examples) epoch += 1 - batches = util.minibatch_by_words( - train_examples, - size=cfg["batch_size"], - discard_oversize=cfg["discard_oversize"], - ) + if batch_strategy == "padded": + batches = util.minibatch_by_padded_size( + train_examples, + size=cfg["batch_size"], + buffer=256, + discard_oversize=cfg["discard_oversize"], + ) + elif batch_strategy == "words": + batches = util.minibatch_by_words( + train_examples, + size=cfg["batch_size"], + discard_oversize=cfg["discard_oversize"], + ) + else: + batches = util.minibatch(train_examples, size=cfg["batch_size"]) + # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop try: first = next(batches) @@ -440,7 +457,9 @@ def train_while_improving( if raw_text: random.shuffle(raw_text) - raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text] + raw_examples = [ + Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text + ] raw_batches = util.minibatch(raw_examples, size=8) for step, (epoch, batch) in enumerate(train_data): diff --git a/spacy/errors.py b/spacy/errors.py index 31533e7e2..fa432382d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -69,6 +69,9 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. 
Note that it may "
            "be more efficient to split your training data into multiple "
            "smaller JSON files instead.")
+    W028 = ("Doc.from_array was called with a vector of type '{type}', "
+            "but is expecting one of type 'uint64' instead. This may result "
+            "in problems with the vocab further on in the pipeline.")
     W030 = ("Some entities could not be aligned in the text \"{text}\" with "
             "entities \"{entities}\". Use "
             "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
@@ -477,15 +480,14 @@ class Errors(object):
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E969 = ("Expected string values for field '{field}', but received {types} instead.")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
             "array and {doc_length} for the Doc itself.")
     E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
     E973 = ("Unexpected type for NER data")
     E974 = ("Unknown {obj} attribute: {key}")
-    E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
-            "but got {type}")
-    E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
+    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index ce1a0928b..355578de3 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):

 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
-        """ Doc can either be text, or an actual Doc """
         if predicted is None:
             raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
@@ -37,6 +36,9 @@ cdef class Example:
         self.y = reference
         self._alignment = alignment

+    def __len__(self):
+        return len(self.predicted)
+
     property predicted:
         def __get__(self):
             return self.x
@@ -59,17 +61,15 @@ cdef class Example:

     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
+        if predicted is None:
+            raise ValueError(Errors.E976.format(n="first", type="Doc"))
         if example_dict is None:
-            raise ValueError(Errors.E976)
-        if not isinstance(predicted, Doc):
-            raise TypeError(Errors.E975.format(type=type(predicted)))
+            raise ValueError(Errors.E976.format(n="second", type="dict"))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
             tok_dict["ORTH"] = [tok.text for tok in predicted]
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
-        if not _has_field(tok_dict, "SPACY"):
-            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
         return Example(
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)
@@ -257,7 +257,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
                 values.append([vocab.morphology.add(v) for v in value])
             else:
                 attrs.append(key)
-                values.append([vocab.strings.add(v) for v in value])
+                try:
+                    values.append([vocab.strings.add(v) for v in value])
+                except TypeError:
+                    types = set([type(v) for v in value])
+                    raise TypeError(Errors.E969.format(field=key, types=types))

     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
@@ -325,8 +329,8 @@ def _fix_legacy_dict_data(example_dict):
         for key, value in old_token_dict.items():
            if
key in ("text", "ids", "brackets"): pass - elif key in remapping: - token_dict[remapping[key]] = value + elif key.lower() in remapping: + token_dict[remapping[key.lower()]] = value else: raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) text = example_dict.get("text", example_dict.get("raw")) diff --git a/spacy/language.py b/spacy/language.py index da45c058c..32c8512fc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -513,20 +513,23 @@ class Language(object): ): """Update the models in the pipeline. - examples (iterable): A batch of `Example` objects. + examples (Iterable[Example]): A batch of examples dummy: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. - sgd (callable): An optimizer. - losses (dict): Dictionary to update with the loss, keyed by component. - component_cfg (dict): Config parameters for specific pipeline + sgd (Optimizer): An optimizer. + losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. + component_cfg (Dict[str, Dict]): Config parameters for specific pipeline components, keyed by component name. + RETURNS (Dict[str, float]): The updated losses dictionary DOCS: https://spacy.io/api/language#update """ if dummy is not None: raise ValueError(Errors.E989) + if losses is None: + losses = {} if len(examples) == 0: - return + return losses if not isinstance(examples, Iterable): raise TypeError(Errors.E978.format(name="language", method="update", types=type(examples))) wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)]) @@ -540,22 +543,19 @@ class Language(object): if component_cfg is None: component_cfg = {} - component_deps = count_pipeline_interdependencies(self.pipeline) - # Determine whether component should set annotations. In theory I guess - # we should do this by inspecting the meta? Or we could just always - # say "yes" for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) component_cfg[name].setdefault("drop", drop) - component_cfg[name]["set_annotations"] = bool(component_deps[i]) + component_cfg[name].setdefault("set_annotations", False) for name, proc in self.pipeline: if not hasattr(proc, "update"): continue proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) - if sgd is not False: + if sgd not in (None, False): for name, proc in self.pipeline: if hasattr(proc, "model"): proc.model.finish_update(sgd) + return losses def rehearse(self, examples, sgd=None, losses=None, config=None): """Make a "rehearsal" update to the models in the pipeline, to prevent @@ -761,18 +761,17 @@ class Language(object): ): """Process texts as a stream, and yield `Doc` objects in order. - texts (iterator): A sequence of texts to process. + texts (Iterable[str]): A sequence of texts to process. as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. batch_size (int): The number of texts to buffer. - disable (list): Names of the pipeline components to disable. + disable (List[str]): Names of the pipeline components to disable. cleanup (bool): If True, unneeded strings are freed to control memory use. Experimental. - component_cfg (dict): An optional dictionary with extra keyword + component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword arguments for specific components. - n_process (int): Number of processors to process texts, only supported - in Python3. 
If -1, set `multiprocessing.cpu_count()`. + n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`. YIELDS (Doc): Documents in the order of the original text. DOCS: https://spacy.io/api/language#pipe diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index 3b5f09e7b..a3e2633e9 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -1,13 +1,14 @@ from thinc.api import Model, normal_init -def PrecomputableAffine(nO, nI, nF, nP): +def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): model = Model( "precomputable_affine", forward, init=init, dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, params={"W": None, "b": None, "pad": None}, + attrs={"dropout_rate": dropout} ) return model @@ -48,17 +49,14 @@ def forward(model, X, is_train): model.inc_grad("b", dY.sum(axis=0)) dY = dY.reshape((dY.shape[0], nO * nP)) - Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3))) + Wopfi = W.transpose((1, 2, 0, 3)) Wopfi = Wopfi.reshape((nO * nP, nF * nI)) dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) - # Reuse the buffer - dWopfi = Wopfi - dWopfi.fill(0.0) - model.ops.gemm(dY, Xf, out=dWopfi, trans1=True) + dWopfi = model.ops.gemm(dY, Xf, trans1=True) dWopfi = dWopfi.reshape((nO, nP, nF, nI)) # (o, p, f, i) --> (f, o, p, i) - dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3))) + dWopfi = dWopfi.transpose((2, 0, 1, 3)) model.inc_grad("W", dWopfi) return dXf.reshape((dXf.shape[0], nF, nI)) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 0d6834f36..879cac2ec 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -87,16 +87,16 @@ def build_text_classifier( cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): lower = HashEmbed( - nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout + nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10 ) prefix = HashEmbed( - nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout + nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11 ) suffix = HashEmbed( - nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout + nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12 ) shape = HashEmbed( - nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout + nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13 ) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index d2b70c36e..2e03d4620 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -154,16 +154,16 @@ def LayerNormalizedMaxout(width, maxout_pieces): def MultiHashEmbed( columns, width, rows, use_subwords, pretrained_vectors, mix, dropout ): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6) if use_subwords: prefix = HashEmbed( - nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout + nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout, seed=7 ) suffix = HashEmbed( - nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout + nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout, seed=8 ) shape = HashEmbed( 
- nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout + nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout, seed=9 ) if pretrained_vectors: @@ -192,7 +192,7 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5) chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) with Model.define_operators({">>": chain, "|": concatenate}): embed_layer = chr_embed | features >> with_array(norm) @@ -263,20 +263,20 @@ def build_Tok2Vec_model( cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): norm = HashEmbed( - nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout, + nO=width, nV=embed_size, column=cols.index(NORM), dropout=None, seed=0 ) if subword_features: prefix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None, seed=1 ) suffix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None, seed=2 ) shape = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout, + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None, seed=3 ) else: @@ -296,7 +296,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -309,7 +309,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -322,7 +322,7 @@ def build_Tok2Vec_model( >> Maxout( nO=width, nI=width * columns, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ), @@ -335,7 +335,7 @@ def build_Tok2Vec_model( reduce_dimensions = Maxout( nO=width, nI=nM * nC + width, - nP=maxout_pieces, + nP=3, dropout=0.0, normalize=True, ) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 88f27f0bf..39d4b0a14 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear from ..syntax._parser_model import ParserStepModel -def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): +def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()): """Set up a stepwise transition-based model""" if upper is None: has_upper = False diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 78e8e17c0..a3aa8be22 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -272,7 +272,7 @@ cdef class Morphology: @staticmethod def feats_to_dict(feats): - if not feats: + if not feats or feats == Morphology.EMPTY_MORPH: return {} return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index f792d57b0..57b778434 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -3,7 +3,7 @@ cimport numpy as np import numpy import srsly -from thinc.api import to_categorical +from thinc.api import SequenceCategoricalCrossentropy from 
..tokens.doc cimport Doc from ..vocab cimport Vocab @@ -85,13 +85,10 @@ class Morphologizer(Tagger): doc.is_morphed = True def get_loss(self, examples, scores): - scores = self.model.ops.flatten(scores) - tag_index = {tag: i for i, tag in enumerate(self.labels)} - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype="f") + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + truths = [] for eg in examples: + eg_truths = [] pos_tags = eg.get_aligned("POS", as_string=True) morphs = eg.get_aligned("MORPH", as_string=True) for i in range(len(morphs)): @@ -104,20 +101,11 @@ class Morphologizer(Tagger): morph = self.vocab.strings[self.vocab.morphology.add(feats)] if morph == "": morph = Morphology.EMPTY_MORPH - if morph is None: - correct[idx] = guesses[idx] - elif morph in tag_index: - correct[idx] = tag_index[morph] - else: - correct[idx] = 0 - known_labels[idx] = 0. - idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [eg.predicted for eg in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + eg_truths.append(morph) + truths.append(eg_truths) + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def to_bytes(self, exclude=tuple()): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 61cf155a2..c35cb4b68 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -58,12 +58,8 @@ class Pipe(object): Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - predictions = self.predict([doc]) - if isinstance(predictions, tuple) and len(predictions) == 2: - scores, tensors = predictions - self.set_annotations([doc], scores, tensors=tensors) - else: - self.set_annotations([doc], predictions) + scores = self.predict([doc]) + self.set_annotations([doc], scores) return doc def pipe(self, stream, batch_size=128): @@ -73,12 +69,8 @@ class Pipe(object): and `set_annotations()` methods. """ for docs in util.minibatch(stream, size=batch_size): - predictions = self.predict(docs) - if isinstance(predictions, tuple) and len(tuple) == 2: - scores, tensors = predictions - self.set_annotations(docs, scores, tensors=tensors) - else: - self.set_annotations(docs, predictions) + scores = self.predict(docs) + self.set_annotations(docs, scores) yield from docs def predict(self, docs): @@ -87,7 +79,7 @@ class Pipe(object): """ raise NotImplementedError - def set_annotations(self, docs, scores, tensors=None): + def set_annotations(self, docs, scores): """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError @@ -281,9 +273,10 @@ class Tagger(Pipe): idx += 1 doc.is_tagged = True - def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): - if losses is not None and self.name not in losses: - losses[self.name] = 0. 
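The Tagger.update() rewrite here is the clearest example of the new contract that runs through this diff: pipes create the losses dict themselves when none is passed, and return it (the same pattern added to Language.update above). A minimal sketch with toy data, mirroring the nlp.begin_training(lambda: train_examples) usage from train.py earlier; the example text and tags are invented:

```python
# Sketch of the new losses-return contract (toy data, not from the diff).
import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("tagger"))
examples = [Example.from_dict(nlp.make_doc("I run"), {"tags": ["PRON", "VERB"]})]
optimizer = nlp.begin_training(lambda: examples)
losses = nlp.update(examples, sgd=optimizer)  # no losses dict passed in
print(losses)  # {"tagger": <float>} - created and returned by the pipeline
```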
+    def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)

         try:
             if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
@@ -303,11 +296,11 @@ class Tagger(Pipe):

         if sgd not in (None, False):
             self.model.finish_update(sgd)
-        if losses is not None:
-            losses[self.name] += loss
+        losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, self._scores2guesses(tag_scores))
+        return losses

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         """Perform a 'rehearsal' update, where we try to match the output of
@@ -334,7 +327,7 @@ class Tagger(Pipe):
             losses[self.name] += (gradient**2).sum()

     def get_loss(self, examples, scores):
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
         truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
@@ -521,29 +514,23 @@ class SentenceRecognizer(Tagger):
                     doc.c[j].sent_start = -1

     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = range(len(self.labels))
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        labels = self.labels
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        truths = []
         for eg in examples:
-            sent_starts = eg.get_aligned("sent_start")
-            for sent_start in sent_starts:
-                if sent_start is None:
-                    correct[idx] = guesses[idx]
-                elif sent_start in tag_index:
-                    correct[idx] = sent_start
+            eg_truth = []
+            for x in eg.get_aligned("sent_start"):
+                if x is None:
+                    eg_truth.append(None)
+                elif x == 1:
+                    eg_truth.append(labels[1])
                 else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
- idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [eg.predicted for eg in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + # anything other than 1: 0, -1, -1 as uint64 + eg_truth.append(labels[0]) + truths.append(eg_truth) + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, @@ -641,7 +628,7 @@ class MultitaskObjective(Tagger): def labels(self, value): self.cfg["labels"] = value - def set_annotations(self, docs, dep_ids, tensors=None): + def set_annotations(self, docs, dep_ids): pass def begin_training(self, get_examples=lambda: [], pipeline=None, @@ -738,7 +725,7 @@ class ClozeMultitask(Pipe): self.cfg = cfg self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config - def set_annotations(self, docs, dep_ids, tensors=None): + def set_annotations(self, docs, dep_ids): pass def begin_training(self, get_examples=lambda: [], pipeline=None, @@ -767,7 +754,7 @@ class ClozeMultitask(Pipe): loss = self.distance.get_loss(prediction, target) return loss, gradient - def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): + def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): pass def rehearse(self, examples, drop=0., sgd=None, losses=None): @@ -815,8 +802,8 @@ class TextCategorizer(Pipe): def pipe(self, stream, batch_size=128): for docs in util.minibatch(stream, size=batch_size): - scores, tensors = self.predict(docs) - self.set_annotations(docs, scores, tensors=tensors) + scores = self.predict(docs) + self.set_annotations(docs, scores) yield from docs def predict(self, docs): @@ -826,22 +813,25 @@ class TextCategorizer(Pipe): # Handle cases where there are no tokens in any docs. xp = get_array_module(tensors) scores = xp.zeros((len(docs), len(self.labels))) - return scores, tensors + return scores scores = self.model.predict(docs) scores = self.model.ops.asarray(scores) - return scores, tensors + return scores - def set_annotations(self, docs, scores, tensors=None): + def set_annotations(self, docs, scores): for i, doc in enumerate(docs): for j, label in enumerate(self.labels): doc.cats[label] = float(scores[i, j]) - def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): + def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) try: if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): # Handle cases where there are no tokens in any docs. 
- return + return losses except AttributeError: types = set([type(eg) for eg in examples]) raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types)) @@ -853,12 +843,11 @@ class TextCategorizer(Pipe): bp_scores(d_scores) if sgd is not None: self.model.finish_update(sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss + losses[self.name] += loss if set_annotations: docs = [eg.predicted for eg in examples] self.set_annotations(docs, scores=scores) + return losses def rehearse(self, examples, drop=0., sgd=None, losses=None): if self._rehearsal_model is None: @@ -1082,12 +1071,13 @@ class EntityLinker(Pipe): sgd = self.create_optimizer() return sgd - def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None): + def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None): self.require_kb() - if losses is not None: - losses.setdefault(self.name, 0.0) + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) if not examples: - return 0 + return losses sentence_docs = [] try: docs = [eg.predicted for eg in examples] @@ -1130,20 +1120,19 @@ class EntityLinker(Pipe): return 0.0 sentence_encodings, bp_context = self.model.begin_update(sentence_docs) loss, d_scores = self.get_similarity_loss( - scores=sentence_encodings, + sentence_encodings=sentence_encodings, examples=examples ) bp_context(d_scores) if sgd is not None: self.model.finish_update(sgd) - if losses is not None: - losses[self.name] += loss + losses[self.name] += loss if set_annotations: self.set_annotations(docs, predictions) - return loss + return losses - def get_similarity_loss(self, examples, scores): + def get_similarity_loss(self, examples, sentence_encodings): entity_encodings = [] for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) @@ -1155,41 +1144,23 @@ class EntityLinker(Pipe): entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - if scores.shape != entity_encodings.shape: + if sentence_encodings.shape != entity_encodings.shape: raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up")) - gradients = self.distance.get_grad(scores, entity_encodings) - loss = self.distance.get_loss(scores, entity_encodings) + gradients = self.distance.get_grad(sentence_encodings, entity_encodings) + loss = self.distance.get_loss(sentence_encodings, entity_encodings) loss = loss / len(entity_encodings) return loss, gradients - def get_loss(self, examples, scores): - cats = [] - for eg in examples: - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.predicted.ents: - kb_id = kb_ids[ent.start] - if kb_id: - cats.append([1.0]) - - cats = self.model.ops.asarray(cats, dtype="float32") - if len(scores) != len(cats): - raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up")) - - d_scores = (scores - cats) - loss = (d_scores ** 2).sum() - loss = loss / len(cats) - return loss, d_scores - def __call__(self, doc): - kb_ids, tensors = self.predict([doc]) - self.set_annotations([doc], kb_ids, tensors=tensors) + kb_ids = self.predict([doc]) + self.set_annotations([doc], kb_ids) return doc def pipe(self, stream, batch_size=128): for docs in util.minibatch(stream, size=batch_size): - kb_ids, tensors = self.predict(docs) - self.set_annotations(docs, kb_ids, tensors=tensors) + kb_ids = self.predict(docs) + self.set_annotations(docs, kb_ids) yield from docs def 
predict(self, docs): @@ -1197,10 +1168,9 @@ class EntityLinker(Pipe): self.require_kb() entity_count = 0 final_kb_ids = [] - final_tensors = [] if not docs: - return final_kb_ids, final_tensors + return final_kb_ids if isinstance(docs, Doc): docs = [docs] @@ -1234,21 +1204,18 @@ class EntityLinker(Pipe): if to_discard and ent.label_ in to_discard: # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) else: candidates = self.kb.get_candidates(ent.text) if not candidates: # no prediction possible for this entity - setting to NIL final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) elif len(candidates) == 1: # shortcut for efficiency reasons: take the 1 candidate # TODO: thresholding final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) else: random.shuffle(candidates) @@ -1277,14 +1244,13 @@ class EntityLinker(Pipe): best_index = scores.argmax().item() best_candidate = candidates[best_index] final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) - if not (len(final_tensors) == len(final_kb_ids) == entity_count): + if not (len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) - return final_kb_ids, final_tensors + return final_kb_ids - def set_annotations(self, docs, kb_ids, tensors=None): + def set_annotations(self, docs, kb_ids): count_ents = len([ent for doc in docs for ent in doc.ents]) if count_ents != len(kb_ids): raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) @@ -1400,11 +1366,7 @@ class Sentencizer(Pipe): def pipe(self, stream, batch_size=128): for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) - if isinstance(predictions, tuple) and len(tuple) == 2: - scores, tensors = predictions - self.set_annotations(docs, scores, tensors=tensors) - else: - self.set_annotations(docs, predictions) + self.set_annotations(docs, predictions) yield from docs def predict(self, docs): @@ -1435,7 +1397,7 @@ class Sentencizer(Pipe): guesses.append(doc_guesses) return guesses - def set_annotations(self, docs, batch_tag_ids, tensors=None): + def set_annotations(self, docs, batch_tag_ids): if isinstance(docs, Doc): docs = [docs] cdef Doc doc diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index e4a1e15e9..bf5783b1a 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -57,7 +57,7 @@ class SimpleNER(Pipe): scores = self.model.predict(docs) return scores - def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None): + def set_annotations(self, docs: List[Doc], scores: List[Floats2d]): """Set entities on a batch of documents from a batch of scores.""" tag_names = self.get_tag_names() for i, doc in enumerate(docs): @@ -67,9 +67,12 @@ class SimpleNER(Pipe): tags = iob_to_biluo(tags) doc.ents = spans_from_biluo_tags(doc, tags) - def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): + def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None): + if losses is None: + losses = {} + losses.setdefault("ner", 0.0) if not any(_has_ner(eg) for eg in examples): - return 0 + return losses docs = [eg.predicted for eg in examples] set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) @@ -79,10 +82,8 @@ class SimpleNER(Pipe): self.set_annotations(docs, scores) if sgd is not None: 
            self.model.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault("ner", 0.0)
-            losses["ner"] += loss
-        return loss
+        losses["ner"] += loss
+        return losses

     def get_loss(self, examples, scores):
         loss = 0
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index a06513a73..56afb3925 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -83,12 +83,14 @@ class Tok2Vec(Pipe):
             assert tokvecs.shape[0] == len(doc)
             doc.tensor = tokvecs

-    def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
+    def update(self, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False):
         """Update the model.

-        examples (iterable): A batch of examples
+        examples (Iterable[Example]): A batch of examples
         drop (float): The dropout rate.
-        sgd (callable): An optimizer.
-        RETURNS (dict): Results from the update.
+        sgd (Optimizer): An optimizer.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        set_annotations (bool): Whether or not to update the examples with the predictions.
+        RETURNS (Dict[str, float]): The updated losses dictionary
         """
         if losses is None:
             losses = {}
@@ -124,6 +126,7 @@ class Tok2Vec(Pipe):
                 self.listeners[-1].receive(batch_id, tokvecs, backprop)
         if set_annotations:
             self.set_annotations(docs, tokvecs)
+        return losses

     def get_loss(self, docs, golds, scores):
         pass
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 38e08b4cb..c67814dfd 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
 class ProjectConfigAsset(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
-    url: StrictStr = Field(..., title="URL of asset")
+    url: Optional[StrictStr] = Field(None, title="URL of asset")
     checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     # fmt: on

@@ -232,9 +232,10 @@ class ProjectConfigCommand(BaseModel):
     name: StrictStr = Field(..., title="Name of command")
     help: Optional[StrictStr] = Field(None, title="Command description")
     script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
-    deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
-    outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
-    outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
+    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
+    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
+    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
+    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
     # fmt: on

     class Config:
@@ -246,7 +247,7 @@ class ProjectConfigSchema(BaseModel):
     # fmt: off
     variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
     assets: List[ProjectConfigAsset] = Field([], title="Data assets")
-    run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
+    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
     # fmt: on

diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 853facdc6..7acee5efd 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no

 class ParserStepModel(Model):
-    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
+    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
+            dropout=0.1):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
         self.attrs["has_upper"] = has_upper
+        self.attrs["dropout_rate"] = dropout
         self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
         if layers[1].get_dim("nP") >= 2:
             activation = "maxout"
@@ -243,6 +245,13 @@ class ParserStepModel(Model):
             for class_ in unseen_classes:
                 self._class_mask[class_] = 0.

+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
     @property
     def nO(self):
         if self.attrs["has_upper"]:
@@ -271,6 +280,19 @@ class ParserStepModel(Model):
                 c_ids += ids.shape[1]
         return ids

+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@@ -289,11 +311,17 @@ class ParserStepModel(Model):
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs

+NUMPY_OPS = NumpyOps()

 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+    mask = None
     if model.attrs["has_upper"]:
+        dropout_rate = model.attrs["dropout_rate"]
+        if is_train and dropout_rate > 0:
+            mask = NUMPY_OPS.get_dropout_mask(vector.shape, dropout_rate)
+            vector *= mask
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
@@ -305,16 +333,9 @@ def step_forward(model: ParserStepModel, states, is_train):
         # Zero vectors for unseen classes
         d_scores *= model._class_mask
         d_vector = get_d_vector(d_scores)
-        if isinstance(model.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            model.backprops.append((
-                util.get_async(model.cuda_stream, token_ids),
-                util.get_async(model.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+        if mask is not None:
+            d_vector *= mask
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
         return None
     return scores, backprop_parser_step
@@ -437,7 +458,7 @@ cdef class precompute_hiddens:
         sum_state_features(state_vector.data,
             feat_weights, &ids[0,0],
             token_ids.shape[0], self.nF, self.nO*self.nP)
-        state_vector = state_vector + self.bias
+        state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

         def backward(d_state_vector_ids):
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 0295241c6..591afe5ab 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -65,7 +65,6 @@ cdef class Parser:
         self.set_output(self.moves.n_moves)
         self.cfg = dict(cfg)
         self.cfg.setdefault("update_with_oracle_cut_size", 100)
-        self.cfg.setdefault("normalize_gradients_with_batch_size", True)
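One note on the dropout added to step_forward() above: it is hand-rolled inverted dropout. The mask built by Ops.get_dropout_mask() both zeroes units and rescales the survivors in the forward pass, and the identical mask is applied to d_vector on the way back so the gradient matches. A standalone sketch of that mechanic with toy shapes (thinc v8 API; the shapes and values here are invented for illustration):

```python
from thinc.api import NumpyOps

ops = NumpyOps()
vector = ops.alloc2f(4, 8) + 1.0    # toy hidden activations
# Mask zeroes ~10% of units and scales survivors by 1 / (1 - 0.1),
# keeping the expected activation magnitude unchanged at train time.
mask = ops.get_dropout_mask(vector.shape, 0.1)
vector *= mask                      # forward pass
d_vector = ops.alloc2f(4, 8) + 1.0  # pretend gradient from the upper layer
d_vector *= mask                    # backward pass: reuse the same mask
```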
self._multitasks = [] for multitask in cfg.get("multitasks", []): self.add_multitask_objective(multitask) @@ -154,7 +153,7 @@ cdef class Parser: doc (Doc): The document to be processed. """ states = self.predict([doc]) - self.set_annotations([doc], states, tensors=None) + self.set_annotations([doc], states) return doc def pipe(self, docs, int batch_size=256): @@ -171,7 +170,7 @@ cdef class Parser: for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): subbatch = list(subbatch) parse_states = self.predict(subbatch) - self.set_annotations(subbatch, parse_states, tensors=None) + self.set_annotations(subbatch, parse_states) yield from batch_in_order def predict(self, docs): @@ -201,6 +200,8 @@ cdef class Parser: with nogil: self._parseC(&states[0], weights, sizes) + model.clear_memory() + del model return batch cdef void _parseC(self, StateC** states, @@ -223,7 +224,7 @@ cdef class Parser: unfinished.clear() free_activations(&activations) - def set_annotations(self, docs, states, tensors=None): + def set_annotations(self, docs, states): cdef StateClass state cdef Doc doc for i, (state, doc) in enumerate(zip(states, docs)): @@ -264,7 +265,7 @@ cdef class Parser: states[i].push_hist(guess) free(is_valid) - def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): + def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): cdef StateClass state if losses is None: losses = {} @@ -280,11 +281,12 @@ cdef class Parser: [eg.predicted for eg in examples]) if self.cfg["update_with_oracle_cut_size"] >= 1: # Chop sequences into lengths of this many transitions, to make the - # batch uniform length. We randomize this to overfit less. + # batch uniform length. + # We used to randomize this, but it's not clear that actually helps? cut_size = self.cfg["update_with_oracle_cut_size"] states, golds, max_steps = self._init_gold_batch( examples, - max_length=numpy.random.choice(range(5, cut_size)) + max_length=cut_size ) else: states, golds, _ = self.moves.init_gold_batch(examples) @@ -292,24 +294,15 @@ cdef class Parser: if not states: return losses all_states = list(states) - states_golds = zip(states, golds) - for _ in range(max_steps): - if not states_golds: - break + states_golds = list(zip(states, golds)) + while states_golds: states, golds = zip(*states_golds) scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, losses) - if self.cfg["normalize_gradients_with_batch_size"]: - # We have to be very careful how we do this, because of the way we - # cut up the batch. We subdivide long sequences. If we normalize - # naively, we end up normalizing by sequence length, which - # is bad: that would mean that states in long sequences - # consistently get smaller gradients. Imagine if we have two - # sequences, one length 1000, one length 20. If we cut up - # the 1k sequence so that we have a "batch" of 50 subsequences, - # we don't want the gradients to get 50 times smaller! - d_scores /= n_examples - + # Note that the gradient isn't normalized by the batch size + # here, because our "samples" are really the states...But we + # can't normalize by the number of states either, as then we'd + # be getting smaller gradients for states in long sequences. backprop(d_scores) # Follow the predicted action self.transition_states(states, scores) @@ -321,6 +314,13 @@ cdef class Parser: if set_annotations: docs = [eg.predicted for eg in examples] self.set_annotations(docs, all_states) + # Ugh, this is annoying. 
@@ -321,6 +314,13 @@ cdef class Parser:
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, all_states)
+        # If we're working on GPU, we want to free the memory as soon as
+        # possible. Python doesn't seem to release these references in
+        # time unless we delete them explicitly.
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
         return losses

     def rehearse(self, examples, sgd=None, losses=None, **cfg):
@@ -344,7 +344,7 @@ cdef class Parser:
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
         tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, finish_update = self.model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
         n_scores = 0.
         loss = 0.
         while states:
@@ -360,10 +360,16 @@ cdef class Parser:
             states = [state for state in states if not state.is_final()]
             n_scores += d_scores.size
         # Do the backprop
-        finish_update(docs)
+        backprop_tok2vec(docs)
         if sgd is not None:
             self.model.finish_update(sgd)
         losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
         return losses

     def get_gradients(self):
@@ -407,6 +413,7 @@ cdef class Parser:
             cpu_log_loss(c_d_scores,
                 costs, is_valid, &scores[i, 0], d_scores.shape[1])
             c_d_scores += d_scores.shape[1]
+        # Note that we don't normalize this. See comment in update() for why.
         if losses is not None:
             losses.setdefault(self.name, 0.)
             losses[self.name] += (d_scores**2).sum()
@@ -525,21 +532,25 @@ cdef class Parser:
             StateClass state
             Transition action
         all_states = self.moves.init_batch([eg.predicted for eg in examples])
+        states = []
+        golds = []
         kept = []
         max_length_seen = 0
         for state, eg in zip(all_states, examples):
             if self.moves.has_gold(eg) and not state.is_final():
                 gold = self.moves.init_gold(state, eg)
-                oracle_actions = self.moves.get_oracle_sequence_from_state(
-                    state.copy(), gold)
-                kept.append((eg, state, gold, oracle_actions))
-                min_length = min(min_length, len(oracle_actions))
-                max_length_seen = max(max_length, len(oracle_actions))
+                if len(eg.x) < max_length:
+                    states.append(state)
+                    golds.append(gold)
+                else:
+                    oracle_actions = self.moves.get_oracle_sequence_from_state(
+                        state.copy(), gold)
+                    kept.append((eg, state, gold, oracle_actions))
+                    min_length = min(min_length, len(oracle_actions))
+                    max_length_seen = max(max_length_seen, len(oracle_actions))
         if not kept:
-            return [], [], 0
+            return states, golds, 0
         max_length = max(min_length, min(max_length, max_length_seen))
-        states = []
-        golds = []
         cdef int clas
         max_moves = 0
         for eg, state, gold, oracle_actions in kept:
diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py
index 86d9a0180..496ec7e03 100644
--- a/spacy/tests/parser/test_nonproj.py
+++ b/spacy/tests/parser/test_nonproj.py
@@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):

 def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
     assert contains_cycle(tree) is None
-    assert contains_cycle(cyclic_tree) == set([3, 4, 5])
+    assert contains_cycle(cyclic_tree) == {3, 4, 5}
     assert contains_cycle(partial_tree) is None
     assert contains_cycle(multirooted_tree) is None

diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 4cff31712..5b9a1cd8e 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -198,10 +198,10 @@ def test_overfitting_IO():
     nlp.add_pipe(parser)
     optimizer = nlp.begin_training()

-    for i in range(50):
+    for i in range(100):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
-
assert losses["parser"] < 0.00001 + assert losses["parser"] < 0.0001 # test the trained model test_text = "I like securities." diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index bfa1bd65a..82f536076 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -38,6 +38,11 @@ def test_overfitting_IO(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # add some cases where SENT_START == -1 + train_examples[0].reference[10].is_sent_start = False + train_examples[1].reference[1].is_sent_start = False + train_examples[1].reference[11].is_sent_start = False + nlp.add_pipe(senter) optimizer = nlp.begin_training() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 07d73eb6e..a39b5075b 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -84,7 +84,7 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly fix_random_seed(0) nlp = English() - textcat = nlp.create_pipe("textcat") + textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True}) train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 67966f70e..8b998d216 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -23,6 +23,7 @@ def test_issue2070(): assert len(doc) == 11 +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -134,6 +135,7 @@ def test_issue2464(en_vocab): assert len(matches) == 3 +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 5d504a9c6..768ae33fe 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls): assert doc[0].like_num +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2800(): """Test issue that arises when too many labels are added to NER model. Used to cause segfault. 
""" nlp = English() train_data = [] - train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]) + train_data.extend( + [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] + ) entity_types = [str(i) for i in range(1000)] ner = nlp.create_pipe("ner") nlp.add_pipe(ner) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 1aceba68f..1d5bfcb92 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -88,6 +88,7 @@ def test_issue3199(): assert list(doc[0:3].noun_chunks) == [] +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue3209(): """Test issue that occurred in spaCy nightly where NER labels were being mapped to classes incorrectly after loading the model, when the labels diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py new file mode 100644 index 000000000..5e2ee902c --- /dev/null +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -0,0 +1,472 @@ +import pytest +from spacy.language import Language +from spacy.vocab import Vocab +from spacy.pipeline import EntityRuler, DependencyParser +from spacy.pipeline.defaults import default_parser +from spacy import displacy, load +from spacy.displacy import parse_deps +from spacy.tokens import Doc, Token +from spacy.matcher import Matcher, PhraseMatcher +from spacy.errors import MatchPatternError +from spacy.util import minibatch +from spacy.gold import Example +from spacy.lang.hi import Hindi +from spacy.lang.es import Spanish +from spacy.lang.en import English +from spacy.attrs import IS_ALPHA +from thinc.api import compounding +import spacy +import srsly +import numpy + +from ..util import make_tempdir, get_doc + + +@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +def test_issue3521(en_tokenizer, word): + tok = en_tokenizer(word)[1] + # 'not' and 'would' should be stopwords, also in their abbreviated forms + assert tok.is_stop + + +def test_issue_3526_1(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + ruler_bytes = ruler.to_bytes() + assert len(ruler) == len(patterns) + assert len(ruler.labels) == 4 + assert ruler.overwrite + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(ruler_bytes) + assert len(new_ruler) == len(ruler) + assert len(new_ruler.labels) == 4 + assert new_ruler.overwrite == ruler.overwrite + assert new_ruler.ent_id_sep == ruler.ent_id_sep + + +def test_issue_3526_2(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + bytes_old_style = srsly.msgpack_dumps(ruler.patterns) + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(bytes_old_style) + assert len(new_ruler) == len(ruler) + for pattern in ruler.patterns: + assert 
pattern in new_ruler.patterns + assert new_ruler.overwrite is not ruler.overwrite + + +def test_issue_3526_3(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + with make_tempdir() as tmpdir: + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert len(new_ruler) == len(ruler) + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue_3526_4(en_vocab): + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, overwrite_ents=True) + ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) + nlp.add_pipe(ruler) + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True + nlp2 = load(tmpdir) + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True + + +def test_issue3531(): + """Test that displaCy renderer doesn't require "settings" key.""" + example_dep = { + "words": [ + {"text": "But", "tag": "CCONJ"}, + {"text": "Google", "tag": "PROPN"}, + {"text": "is", "tag": "VERB"}, + {"text": "starting", "tag": "VERB"}, + {"text": "from", "tag": "ADP"}, + {"text": "behind.", "tag": "ADV"}, + ], + "arcs": [ + {"start": 0, "end": 3, "label": "cc", "dir": "left"}, + {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "aux", "dir": "left"}, + {"start": 3, "end": 4, "label": "prep", "dir": "right"}, + {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, + ], + } + example_ent = { + "text": "But Google is starting from behind.", + "ents": [{"start": 4, "end": 10, "label": "ORG"}], + } + dep_html = displacy.render(example_dep, style="dep", manual=True) + assert dep_html + ent_html = displacy.render(example_ent, style="ent", manual=True) + assert ent_html + + +def test_issue3540(en_vocab): + words = ["I", "live", "in", "NewYork", "right", "now"] + tensor = numpy.asarray( + [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], + dtype="f", + ) + doc = Doc(en_vocab, words=words) + doc.tensor = tensor + gold_text = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_1 = [token.vector for token in doc] + assert len(vectors_1) == len(doc) + + with doc.retokenize() as retokenizer: + heads = [(doc[3], 1), doc[2]] + attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} + retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) + + gold_text = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_2 = [token.vector for token in doc] + assert 
len(vectors_2) == len(doc)
+    assert vectors_1[0].tolist() == vectors_2[0].tolist()
+    assert vectors_1[1].tolist() == vectors_2[1].tolist()
+    assert vectors_1[2].tolist() == vectors_2[2].tolist()
+    assert vectors_1[4].tolist() == vectors_2[5].tolist()
+    assert vectors_1[5].tolist() == vectors_2[6].tolist()
+
+
+def test_issue3549(en_vocab):
+    """Test that match pattern validation doesn't raise on empty errors."""
+    matcher = Matcher(en_vocab, validate=True)
+    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
+    matcher.add("GOOD", [pattern])
+    with pytest.raises(MatchPatternError):
+        matcher.add("BAD", [[{"X": "Y"}]])
+
+
+@pytest.mark.xfail
+def test_issue3555(en_vocab):
+    """Test that custom extensions with default None don't break matcher."""
+    Token.set_extension("issue3555", default=None)
+    matcher = Matcher(en_vocab)
+    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
+    matcher.add("TEST", [pattern])
+    doc = Doc(en_vocab, words=["have", "apple"])
+    matcher(doc)
+
+
+def test_issue3611():
+    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
+    unique_classes = ["offensive", "inoffensive"]
+    x_train = [
+        "This is an offensive text",
+        "This is the second offensive text",
+        "inoff",
+    ]
+    y_train = ["offensive", "offensive", "inoffensive"]
+    nlp = spacy.blank("en")
+    # preparing the data
+    train_data = []
+    for text, train_instance in zip(x_train, y_train):
+        cat_dict = {label: label == train_instance for label in unique_classes}
+        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
+    # add a text categorizer component
+    textcat = nlp.create_pipe(
+        "textcat",
+        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
+    )
+    for label in unique_classes:
+        textcat.add_label(label)
+    nlp.add_pipe(textcat, last=True)
+    # training the network
+    with nlp.select_pipes(enable="textcat"):
+        optimizer = nlp.begin_training(X=x_train, Y=y_train)
+        for i in range(3):
+            losses = {}
+            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+            for batch in batches:
+                nlp.update(
+                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
+                )
+
+
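test_issue3611 above, like several tests that follow, batches its updates with minibatch(train_data, size=compounding(4.0, 32.0, 1.001)). Thinc's compounding schedule is an infinite generator whose values grow geometrically from the start value toward the stop value, so early updates use small batches; a quick sketch of the first few values it should produce:

    from thinc.api import compounding

    sizes = compounding(4.0, 32.0, 1.001)  # start, stop, compound factor
    first_three = [next(sizes) for _ in range(3)]
    # first_three == [4.0, 4.004, 4.008004]: each value is the previous
    # one multiplied by 1.001, capped at 32.0 once it reaches the stop.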
होटल, होटल") + expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] + assert [token.text for token in doc] == expected + + +def test_issue3803(): + """Test that spanish num-like tokens have True for like_num attribute.""" + nlp = Spanish() + text = "2 dos 1000 mil 12 doce" + doc = nlp(text) + + assert [t.like_num for t in doc] == [True, True, True, True, True, True] + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3830_no_subtok(): + """Test that the parser doesn't have subtok label if not learn_tokens""" + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + parser = DependencyParser(Vocab(), default_parser(), **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.begin_training(lambda: []) + assert "subtok" not in parser.labels + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3830_with_subtok(): + """Test that the parser does have subtok label if learn_tokens=True.""" + config = { + "learn_tokens": True, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + parser = DependencyParser(Vocab(), default_parser(), **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.begin_training(lambda: []) + assert "subtok" in parser.labels + + +def test_issue3839(en_vocab): + """Test that match IDs returned by the matcher are correct, are in the string """ + doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) + matcher = Matcher(en_vocab) + match_id = "PATTERN" + pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] + pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] + matcher.add(match_id, [pattern1]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + matcher = Matcher(en_vocab) + matcher.add(match_id, [pattern2]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + + +@pytest.mark.parametrize( + "sentence", + [ + "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", + "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", + "It was a missed assignment, but it shouldn't have resulted in a turnover ...", + ], +) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + doc = nlp(sentence) + count = 0 + for token in doc: + count += token.is_alpha + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + +def test_issue3879(en_vocab): + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + assert len(doc) == 5 + pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] + matcher = Matcher(en_vocab) + matcher.add("TEST", [pattern]) + assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3880(): + """Test that `nlp.pipe()` works when an empty string ends the batch. + + Fixed in v7.0.5 of Thinc. 
+ """ + texts = ["hello", "world", "", ""] + nlp = English() + nlp.add_pipe(nlp.create_pipe("parser")) + nlp.add_pipe(nlp.create_pipe("ner")) + nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.get_pipe("parser").add_label("dep") + nlp.get_pipe("ner").add_label("PERSON") + nlp.get_pipe("tagger").add_label("NN") + nlp.begin_training() + for doc in nlp.pipe(texts): + pass + + +def test_issue3882(en_vocab): + """Test that displaCy doesn't serialize the doc.user_data when making a + copy of the Doc. + """ + doc = Doc(en_vocab, words=["Hello", "world"]) + doc.is_parsed = True + doc.user_data["test"] = set() + parse_deps(doc) + + +def test_issue3951(en_vocab): + """Test that combinations of optional rules are matched correctly.""" + matcher = Matcher(en_vocab) + pattern = [ + {"LOWER": "hello"}, + {"LOWER": "this", "OP": "?"}, + {"OP": "?"}, + {"LOWER": "world"}, + ] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) + matches = matcher(doc) + assert len(matches) == 0 + + +def test_issue3959(): + """ Ensure that a modified pos attribute is serialized correctly.""" + nlp = English() + doc = nlp( + "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" + ) + assert doc[0].pos_ == "" + doc[0].pos_ = "NOUN" + assert doc[0].pos_ == "NOUN" + # usually this is already True when starting from proper models instead of blank English + doc.is_tagged = True + with make_tempdir() as tmp_dir: + file_path = tmp_dir / "my_doc" + doc.to_disk(file_path) + doc2 = nlp("") + doc2.from_disk(file_path) + assert doc2[0].pos_ == "NOUN" + + +def test_issue3962(en_vocab): + """ Ensure that as_doc does not result in out-of-bound access of tokens. + This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] + heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] + deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = doc[1:5] # "jests at scars ," + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "dep" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" # head set to the new artificial root + assert doc2[3].dep_ == "dep" + # We should still have 1 sentence + assert len(list(doc2.sents)) == 1 + span3 = doc[6:9] # "never felt a" + doc3 = span3.as_doc() + doc3_json = doc3.to_json() + assert doc3_json + assert doc3[0].head.text == "felt" + assert doc3[0].dep_ == "neg" + assert doc3[1].head.text == "felt" + assert doc3[1].dep_ == "ROOT" + assert doc3[2].head.text == "felt" # head set to ancestor + assert doc3[2].dep_ == "dep" + # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" + assert len(list(doc3.sents)) == 1 + + +def test_issue3962_long(en_vocab): + """ Ensure that as_doc does not result in out-of-bound access of tokens. 
+ This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] + heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] + deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = two_sent_doc[1:7] # "jests at scars. They never" + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root (in sentence 1) + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "ROOT" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" + assert doc2[3].dep_ == "punct" + # head set to itself, being the new artificial root (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # head set to the new artificial head (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # We should still have 2 sentences + sents = list(doc2.sents) + assert len(sents) == 2 + assert sents[0].text == "jests at scars ." + assert sents[1].text == "They never" + + +def test_issue3972(en_vocab): + """Test that the PhraseMatcher returns duplicates for duplicate match IDs. + """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) + matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) + doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) + matches = matcher(doc) + + assert len(matches) == 2 + + # We should have a match for each of the two rules + found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] + assert "A" in found_ids + assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py deleted file mode 100644 index 3d8ee9922..000000000 --- a/spacy/tests/regression/test_issue3521.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - - -@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) -def test_issue3521(en_tokenizer, word): - tok = en_tokenizer(word)[1] - # 'not' and 'would' should be stopwords, also in their abbreviated forms - assert tok.is_stop diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py deleted file mode 100644 index aa77028fb..000000000 --- a/spacy/tests/regression/test_issue3526.py +++ /dev/null @@ -1,85 +0,0 @@ -import pytest -from spacy.tokens import Span -from spacy.language import Language -from spacy.pipeline import EntityRuler -from spacy import load -import srsly - -from ..util import make_tempdir - - -@pytest.fixture -def patterns(): - return [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - - -@pytest.fixture -def add_ent(): - def add_ent_component(doc): - doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])] - return doc - - return add_ent_component - - -def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - ruler_bytes = ruler.to_bytes() - 
assert len(ruler) == len(patterns) - assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(ruler_bytes) - assert len(new_ruler) == len(ruler) - assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite - - -def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, overwrite_ents=True) - - ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) - nlp.add_pipe(ruler) - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - ruler = nlp.get_pipe("entity_ruler") - assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True - nlp2 = load(tmpdir) - new_ruler = nlp2.get_pipe("entity_ruler") - assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert new_ruler.overwrite is True diff --git a/spacy/tests/regression/test_issue3531.py b/spacy/tests/regression/test_issue3531.py deleted file mode 100644 index 4c65a5bfe..000000000 --- a/spacy/tests/regression/test_issue3531.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy import displacy - - -def test_issue3531(): - """Test that displaCy renderer doesn't require "settings" key.""" - example_dep = { - "words": [ - {"text": "But", "tag": "CCONJ"}, - {"text": "Google", "tag": "PROPN"}, - {"text": "is", "tag": "VERB"}, - {"text": "starting", "tag": "VERB"}, - {"text": "from", "tag": "ADP"}, - {"text": "behind.", "tag": "ADV"}, - ], - "arcs": [ - {"start": 0, "end": 3, "label": "cc", "dir": "left"}, - {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, - {"start": 2, "end": 3, "label": "aux", "dir": "left"}, - {"start": 3, "end": 4, "label": "prep", "dir": "right"}, - {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, - ], - } - example_ent = { - "text": "But Google is starting from behind.", - "ents": [{"start": 4, "end": 10, "label": "ORG"}], - } - dep_html = displacy.render(example_dep, style="dep", manual=True) - assert dep_html - ent_html = displacy.render(example_ent, style="ent", manual=True) - assert ent_html diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py deleted file mode 100644 index be9e04b0b..000000000 --- a/spacy/tests/regression/test_issue3540.py +++ /dev/null @@ -1,44 +0,0 @@ -from spacy.tokens import Doc - -import numpy as np - - -def test_issue3540(en_vocab): - - words = ["I", "live", "in", "NewYork", "right", "now"] - tensor = 
np.asarray( - [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], - dtype="f", - ) - doc = Doc(en_vocab, words=words) - doc.tensor = tensor - - gold_text = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.text for token in doc] == gold_text - - gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - - vectors_1 = [token.vector for token in doc] - assert len(vectors_1) == len(doc) - - with doc.retokenize() as retokenizer: - heads = [(doc[3], 1), doc[2]] - attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} - retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) - - gold_text = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.text for token in doc] == gold_text - - gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - - vectors_2 = [token.vector for token in doc] - assert len(vectors_2) == len(doc) - - assert vectors_1[0].tolist() == vectors_2[0].tolist() - assert vectors_1[1].tolist() == vectors_2[1].tolist() - assert vectors_1[2].tolist() == vectors_2[2].tolist() - - assert vectors_1[4].tolist() == vectors_2[5].tolist() - assert vectors_1[5].tolist() == vectors_2[6].tolist() diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py deleted file mode 100644 index b3af59c2e..000000000 --- a/spacy/tests/regression/test_issue3549.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest -from spacy.matcher import Matcher -from spacy.errors import MatchPatternError - - -def test_issue3549(en_vocab): - """Test that match pattern validation doesn't raise on empty errors.""" - matcher = Matcher(en_vocab, validate=True) - pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] - matcher.add("GOOD", [pattern]) - with pytest.raises(MatchPatternError): - matcher.add("BAD", [[{"X": "Y"}]]) diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py deleted file mode 100644 index de047bcbc..000000000 --- a/spacy/tests/regression/test_issue3555.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest -from spacy.tokens import Doc, Token -from spacy.matcher import Matcher - - -@pytest.mark.xfail -def test_issue3555(en_vocab): - """Test that custom extensions with default None don't break matcher.""" - Token.set_extension("issue3555", default=None) - matcher = Matcher(en_vocab) - pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["have", "apple"]) - matcher(doc) diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py deleted file mode 100644 index ef189c446..000000000 --- a/spacy/tests/regression/test_issue3611.py +++ /dev/null @@ -1,45 +0,0 @@ -import spacy -from spacy.util import minibatch -from thinc.api import compounding -from spacy.gold import Example - - -def test_issue3611(): - """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - - nlp = spacy.blank("en") - - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": 
cat_dict})) - - # add a text categorizer component - textcat = nlp.create_pipe( - "textcat", - config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, - ) - - for label in unique_classes: - textcat.add_label(label) - nlp.add_pipe(textcat, last=True) - - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training(X=x_train, Y=y_train) - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py deleted file mode 100644 index 51561b3ac..000000000 --- a/spacy/tests/regression/test_issue3625.py +++ /dev/null @@ -1,9 +0,0 @@ -from spacy.lang.hi import Hindi - - -def test_issue3625(): - """Test that default punctuation rules applies to hindi unicode characters""" - nlp = Hindi() - doc = nlp("hi. how हुए. होटल, होटल") - expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] - assert [token.text for token in doc] == expected diff --git a/spacy/tests/regression/test_issue3803.py b/spacy/tests/regression/test_issue3803.py deleted file mode 100644 index ab5250edf..000000000 --- a/spacy/tests/regression/test_issue3803.py +++ /dev/null @@ -1,10 +0,0 @@ -from spacy.lang.es import Spanish - - -def test_issue3803(): - """Test that spanish num-like tokens have True for like_num attribute.""" - nlp = Spanish() - text = "2 dos 1000 mil 12 doce" - doc = nlp(text) - - assert [t.like_num for t in doc] == [True, True, True, True, True, True] diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py deleted file mode 100644 index 06b7893a7..000000000 --- a/spacy/tests/regression/test_issue3830.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.pipeline.pipes import DependencyParser -from spacy.vocab import Vocab - -from spacy.pipeline.defaults import default_parser - - -def test_issue3830_no_subtok(): - """Test that the parser doesn't have subtok label if not learn_tokens""" - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - parser = DependencyParser(Vocab(), default_parser(), **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.begin_training(lambda: []) - assert "subtok" not in parser.labels - - -def test_issue3830_with_subtok(): - """Test that the parser does have subtok label if learn_tokens=True.""" - config = { - "learn_tokens": True, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - parser = DependencyParser(Vocab(), default_parser(), **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.begin_training(lambda: []) - assert "subtok" in parser.labels diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py deleted file mode 100644 index 27b1f5f29..000000000 --- a/spacy/tests/regression/test_issue3839.py +++ /dev/null @@ -1,18 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string """ - doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) - matcher = Matcher(en_vocab) - match_id = "PATTERN" - pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] - pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": 
"group"}] - matcher.add(match_id, [pattern1]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - matcher = Matcher(en_vocab) - matcher.add(match_id, [pattern2]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py deleted file mode 100644 index 0a851e869..000000000 --- a/spacy/tests/regression/test_issue3869.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -from spacy.attrs import IS_ALPHA -from spacy.lang.en import English - - -@pytest.mark.parametrize( - "sentence", - [ - "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", - "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", - "It was a missed assignment, but it shouldn't have resulted in a turnover ...", - ], -) -def test_issue3869(sentence): - """Test that the Doc's count_by function works consistently""" - nlp = English() - doc = nlp(sentence) - - count = 0 - for token in doc: - count += token.is_alpha - - assert count == doc.count_by(IS_ALPHA).get(1, 0) diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py deleted file mode 100644 index 8500c09aa..000000000 --- a/spacy/tests/regression/test_issue3879.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3879(en_vocab): - doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) - assert len(doc) == 5 - pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] - matcher = Matcher(en_vocab) - matcher.add("TEST", [pattern]) - assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py deleted file mode 100644 index 6e8ab6f43..000000000 --- a/spacy/tests/regression/test_issue3880.py +++ /dev/null @@ -1,21 +0,0 @@ -from spacy.lang.en import English -import pytest - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_issue3880(): - """Test that `nlp.pipe()` works when an empty string ends the batch. - - Fixed in v7.0.5 of Thinc. - """ - texts = ["hello", "world", "", ""] - nlp = English() - nlp.add_pipe(nlp.create_pipe("parser")) - nlp.add_pipe(nlp.create_pipe("ner")) - nlp.add_pipe(nlp.create_pipe("tagger")) - nlp.get_pipe("parser").add_label("dep") - nlp.get_pipe("ner").add_label("PERSON") - nlp.get_pipe("tagger").add_label("NN") - nlp.begin_training() - for doc in nlp.pipe(texts): - pass diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py deleted file mode 100644 index fa616db1d..000000000 --- a/spacy/tests/regression/test_issue3882.py +++ /dev/null @@ -1,12 +0,0 @@ -from spacy.displacy import parse_deps -from spacy.tokens import Doc - - -def test_issue3882(en_vocab): - """Test that displaCy doesn't serialize the doc.user_data when making a - copy of the Doc. 
- """ - doc = Doc(en_vocab, words=["Hello", "world"]) - doc.is_parsed = True - doc.user_data["test"] = set() - parse_deps(doc) diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py deleted file mode 100644 index 6e4c9eeaa..000000000 --- a/spacy/tests/regression/test_issue3951.py +++ /dev/null @@ -1,17 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3951(en_vocab): - """Test that combinations of optional rules are matched correctly.""" - matcher = Matcher(en_vocab) - pattern = [ - {"LOWER": "hello"}, - {"LOWER": "this", "OP": "?"}, - {"OP": "?"}, - {"LOWER": "world"}, - ] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) - matches = matcher(doc) - assert len(matches) == 0 diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py deleted file mode 100644 index 7db28a31f..000000000 --- a/spacy/tests/regression/test_issue3959.py +++ /dev/null @@ -1,26 +0,0 @@ -from spacy.lang.en import English -from ..util import make_tempdir - - -def test_issue3959(): - """ Ensure that a modified pos attribute is serialized correctly.""" - nlp = English() - doc = nlp( - "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" - ) - assert doc[0].pos_ == "" - - doc[0].pos_ = "NOUN" - assert doc[0].pos_ == "NOUN" - - # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True - - with make_tempdir() as tmp_dir: - file_path = tmp_dir / "my_doc" - doc.to_disk(file_path) - - doc2 = nlp("") - doc2.from_disk(file_path) - - assert doc2[0].pos_ == "NOUN" diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py deleted file mode 100644 index 971c9b08e..000000000 --- a/spacy/tests/regression/test_issue3962.py +++ /dev/null @@ -1,117 +0,0 @@ -import pytest - -from ..util import get_doc - - -@pytest.fixture -def doc(en_tokenizer): - text = "He jests at scars, that never felt a wound." - heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] - deps = [ - "nsubj", - "ccomp", - "prep", - "pobj", - "punct", - "nsubj", - "neg", - "ROOT", - "det", - "dobj", - "punct", - ] - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - - -def test_issue3962(doc): - """ Ensure that as_doc does not result in out-of-bound access of tokens. 
- This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - span2 = doc[1:5] # "jests at scars ," - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - - assert ( - doc2[0].head.text == "jests" - ) # head set to itself, being the new artificial root - assert doc2[0].dep_ == "dep" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" # head set to the new artificial root - assert doc2[3].dep_ == "dep" - - # We should still have 1 sentence - assert len(list(doc2.sents)) == 1 - - span3 = doc[6:9] # "never felt a" - doc3 = span3.as_doc() - doc3_json = doc3.to_json() - assert doc3_json - - assert doc3[0].head.text == "felt" - assert doc3[0].dep_ == "neg" - assert doc3[1].head.text == "felt" - assert doc3[1].dep_ == "ROOT" - assert doc3[2].head.text == "felt" # head set to ancestor - assert doc3[2].dep_ == "dep" - - # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" - assert len(list(doc3.sents)) == 1 - - -@pytest.fixture -def two_sent_doc(en_tokenizer): - text = "He jests at scars. They never felt a wound." - heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] - deps = [ - "nsubj", - "ROOT", - "prep", - "pobj", - "punct", - "nsubj", - "neg", - "ROOT", - "det", - "dobj", - "punct", - ] - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - - -def test_issue3962_long(two_sent_doc): - """ Ensure that as_doc does not result in out-of-bound access of tokens. - This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - span2 = two_sent_doc[1:7] # "jests at scars. They never" - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - - assert ( - doc2[0].head.text == "jests" - ) # head set to itself, being the new artificial root (in sentence 1) - assert doc2[0].dep_ == "ROOT" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" - assert doc2[3].dep_ == "punct" - assert ( - doc2[4].head.text == "They" - ) # head set to itself, being the new artificial root (in sentence 2) - assert doc2[4].dep_ == "dep" - assert ( - doc2[4].head.text == "They" - ) # head set to the new artificial head (in sentence 2) - assert doc2[4].dep_ == "dep" - - # We should still have 2 sentences - sents = list(doc2.sents) - assert len(sents) == 2 - assert sents[0].text == "jests at scars ." - assert sents[1].text == "They never" diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py deleted file mode 100644 index fe5388950..000000000 --- a/spacy/tests/regression/test_issue3972.py +++ /dev/null @@ -1,19 +0,0 @@ -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -def test_issue3972(en_vocab): - """Test that the PhraseMatcher returns duplicates for duplicate match IDs. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) - matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) - doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) - matches = matcher(doc) - - assert len(matches) == 2 - - # We should have a match for each of the two rules - found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] - assert "A" in found_ids - assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py new file mode 100644 index 000000000..626856e9e --- /dev/null +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -0,0 +1,469 @@ +import pytest +from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe +from spacy.pipeline.defaults import default_ner +from spacy.matcher import PhraseMatcher, Matcher +from spacy.tokens import Doc, Span, DocBin +from spacy.gold import Example, Corpus +from spacy.gold.converters import json2docs +from spacy.vocab import Vocab +from spacy.lang.en import English +from spacy.util import minibatch, ensure_path, load_model +from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex +from spacy.tokenizer import Tokenizer +from spacy.lang.el import Greek +from spacy.language import Language +import spacy +from thinc.api import compounding +from collections import defaultdict + +from ..util import make_tempdir + + +def test_issue4002(en_vocab): + """Test that the PhraseMatcher can match on overwritten NORM attributes. + """ + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern1 = Doc(en_vocab, words=["c", "d"]) + assert [t.norm_ for t in pattern1] == ["c", "d"] + matcher.add("TEST", [pattern1]) + doc = Doc(en_vocab, words=["a", "b", "c", "d"]) + assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] + matches = matcher(doc) + assert len(matches) == 1 + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern2 = Doc(en_vocab, words=["1", "2"]) + pattern2[0].norm_ = "c" + pattern2[1].norm_ = "d" + assert [t.norm_ for t in pattern2] == ["c", "d"] + matcher.add("TEST", [pattern2]) + matches = matcher(doc) + assert len(matches) == 1 + + +def test_issue4030(): + """ Test whether textcat works fine with empty doc """ + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + textcat = nlp.create_pipe( + "textcat", + config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, + ) + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.begin_training() + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update( + examples=batch, sgd=optimizer, drop=0.1, losses=losses, + ) + # processing of an empty doc should result in 0.0 for all categories + doc = nlp("") + assert doc.cats["offensive"] == 0.0 + assert doc.cats["inoffensive"] == 0.0 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4042(): + 
"""Test that serialization of an EntityRuler before NER works fine.""" + nlp = English() + + # add ner pipe + ner = nlp.create_pipe("ner") + ner.add_label("SOME_LABEL") + nlp.add_pipe(ner) + nlp.begin_training() + + # Add entity ruler + ruler = EntityRuler(nlp) + patterns = [ + {"label": "MY_ORG", "pattern": "Apple"}, + {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, + ] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler, before="ner") # works fine with "after" + doc1 = nlp("What do you think about Apple ?") + assert doc1.ents[0].label_ == "MY_ORG" + + with make_tempdir() as d: + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + + nlp2 = load_model(output_dir) + doc2 = nlp2("What do you think about Apple ?") + assert doc2.ents[0].label_ == "MY_ORG" + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4042_bug2(): + """ + Test that serialization of an NER works fine when new labels were added. + This is the second bug of two bugs underlying the issue 4042. + """ + nlp1 = English() + vocab = nlp1.vocab + + # add ner pipe + ner1 = nlp1.create_pipe("ner") + ner1.add_label("SOME_LABEL") + nlp1.add_pipe(ner1) + nlp1.begin_training() + + # add a new label to the doc + doc1 = nlp1("What do you think about Apple ?") + assert len(ner1.labels) == 1 + assert "SOME_LABEL" in ner1.labels + apple_ent = Span(doc1, 5, 6, label="MY_ORG") + doc1.ents = list(doc1.ents) + [apple_ent] + + # reapply the NER - at this point it should resize itself + ner1(doc1) + assert len(ner1.labels) == 2 + assert "SOME_LABEL" in ner1.labels + assert "MY_ORG" in ner1.labels + + with make_tempdir() as d: + # assert IO goes fine + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + ner1.to_disk(output_dir) + + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + ner2 = EntityRecognizer(vocab, default_ner(), **config) + ner2.from_disk(output_dir) + assert len(ner2.labels) == 2 + + +def test_issue4054(en_vocab): + """Test that a new blank model can be made with a vocab from file, + and that serialization does not drop the language at any point.""" + nlp1 = English() + vocab1 = nlp1.vocab + with make_tempdir() as d: + vocab_dir = ensure_path(d / "vocab") + if not vocab_dir.exists(): + vocab_dir.mkdir() + vocab1.to_disk(vocab_dir) + vocab2 = Vocab().from_disk(vocab_dir) + print("lang", vocab2.lang) + nlp2 = spacy.blank("en", vocab=vocab2) + nlp_dir = ensure_path(d / "nlp") + if not nlp_dir.exists(): + nlp_dir.mkdir() + nlp2.to_disk(nlp_dir) + nlp3 = load_model(nlp_dir) + assert nlp3.lang == "en" + + +def test_issue4120(en_vocab): + """Test that matches without a final {OP: ?} token are returned.""" + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) + doc1 = Doc(en_vocab, words=["a"]) + assert len(matcher(doc1)) == 1 # works + doc2 = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc2)) == 2 # fixed + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) + doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc3)) == 2 # works + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) + doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc4)) == 3 # fixed + + +def test_issue4133(en_vocab): + nlp = English() + vocab_bytes = nlp.vocab.to_bytes() + words = ["Apple", "is", 
"looking", "at", "buying", "a", "startup"] + pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] + doc = Doc(en_vocab, words=words) + for i, token in enumerate(doc): + token.pos_ = pos[i] + # usually this is already True when starting from proper models instead of blank English + doc.is_tagged = True + doc_bytes = doc.to_bytes() + vocab = Vocab() + vocab = vocab.from_bytes(vocab_bytes) + doc = Doc(vocab).from_bytes(doc_bytes) + actual = [] + for token in doc: + actual.append(token.pos_) + assert actual == pos + + +def test_issue4190(): + def customize_tokenizer(nlp): + prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) + suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) + infix_re = compile_infix_regex(nlp.Defaults.infixes) + # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') + exceptions = { + k: v + for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() + if not (len(k) == 2 and k[1] == ".") + } + new_tokenizer = Tokenizer( + nlp.vocab, + exceptions, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=nlp.tokenizer.token_match, + ) + nlp.tokenizer = new_tokenizer + + test_string = "Test c." + # Load default language + nlp_1 = English() + doc_1a = nlp_1(test_string) + result_1a = [token.text for token in doc_1a] # noqa: F841 + # Modify tokenizer + customize_tokenizer(nlp_1) + doc_1b = nlp_1(test_string) + result_1b = [token.text for token in doc_1b] + # Save and Reload + with make_tempdir() as model_dir: + nlp_1.to_disk(model_dir) + nlp_2 = load_model(model_dir) + # This should be the modified tokenizer + doc_2 = nlp_2(test_string) + result_2 = [token.text for token in doc_2] + assert result_1b == result_2 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4267(): + """ Test that running an entity_ruler after ner gives consistent results""" + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("PEOPLE") + nlp.add_pipe(ner) + nlp.begin_training() + assert "ner" in nlp.pipe_names + # assert that we have correct IOB annotations + doc1 = nlp("hi") + assert doc1.is_nered + for token in doc1: + assert token.ent_iob == 2 + # add entity ruler and run again + ruler = EntityRuler(nlp) + patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + assert "entity_ruler" in nlp.pipe_names + assert "ner" in nlp.pipe_names + # assert that we still have correct IOB annotations + doc2 = nlp("hi") + assert doc2.is_nered + for token in doc2: + assert token.ent_iob == 2 + + +def test_issue4272(): + """Test that lookup table can be accessed from Token.lemma if no POS tags + are available.""" + nlp = Greek() + doc = nlp("Χθες") + assert doc[0].lemma_ + + +def test_multiple_predictions(): + class DummyPipe(Pipe): + def __init__(self): + self.model = "dummy_model" + + def predict(self, docs): + return ([1, 2, 3], [4, 5, 6]) + + def set_annotations(self, docs, scores): + return docs + + nlp = Language() + doc = nlp.make_doc("foo") + dummy_pipe = DummyPipe() + dummy_pipe(doc) + + +@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor") +def test_issue4313(): + """ This should not crash or exit with some strange error code """ + beam_width = 16 + beam_density = 0.0001 + nlp = English() + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + ner = EntityRecognizer(nlp.vocab, default_ner(), **config) + ner.add_label("SOME_LABEL") + 
ner.begin_training([]) + nlp.add_pipe(ner) + + # add a new label to the doc + doc = nlp("What do you think about Apple ?") + assert len(ner.labels) == 1 + assert "SOME_LABEL" in ner.labels + apple_ent = Span(doc, 5, 6, label="MY_ORG") + doc.ents = list(doc.ents) + [apple_ent] + + # ensure the beam_parse still works with the new label + docs = [doc] + beams = nlp.entity.beam_parse( + docs, beam_width=beam_width, beam_density=beam_density + ) + + for doc, beam in zip(docs, beams): + entity_scores = defaultdict(float) + for score, ents in nlp.entity.moves.get_beam_parses(beam): + for start, end, label in ents: + entity_scores[(start, end, label)] += score + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4348(): + """Test that training the tagger with empty data, doesn't throw errors""" + nlp = English() + example = Example.from_dict(nlp.make_doc(""), {"tags": []}) + TRAIN_DATA = [example, example] + tagger = nlp.create_pipe("tagger") + nlp.add_pipe(tagger) + optimizer = nlp.begin_training() + for i in range(5): + losses = {} + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + +def test_issue4367(): + """Test that docbin init goes well""" + DocBin() + DocBin(attrs=["LEMMA"]) + DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) + + +def test_issue4373(): + """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" + matcher = Matcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + matcher = PhraseMatcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + + +def test_issue4402(): + json_data = { + "id": 0, + "paragraphs": [ + { + "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "How", "ner": "O"}, + {"id": 1, "orth": "should", "ner": "O"}, + {"id": 2, "orth": "I", "ner": "O"}, + {"id": 3, "orth": "cook", "ner": "O"}, + {"id": 4, "orth": "bacon", "ner": "O"}, + {"id": 5, "orth": "in", "ner": "O"}, + {"id": 6, "orth": "an", "ner": "O"}, + {"id": 7, "orth": "oven", "ner": "O"}, + {"id": 8, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + { + "tokens": [ + {"id": 9, "orth": "\n", "ner": "O"}, + {"id": 10, "orth": "I", "ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 0.0}, + {"label": "not_baking", 
"value": 1.0}, + ], + }, + ], + } + nlp = English() + attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] + with make_tempdir() as tmpdir: + output_file = tmpdir / "test4402.spacy" + docs = json2docs([json_data]) + data = DocBin(docs=docs, attrs=attrs).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) + + train_data = list(corpus.train_dataset(nlp)) + assert len(train_data) == 2 + + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py deleted file mode 100644 index 3ac26d3ab..000000000 --- a/spacy/tests/regression/test_issue4002.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -def test_issue4002(en_vocab): - """Test that the PhraseMatcher can match on overwritten NORM attributes. - """ - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern1 = Doc(en_vocab, words=["c", "d"]) - assert [t.norm_ for t in pattern1] == ["c", "d"] - matcher.add("TEST", [pattern1]) - doc = Doc(en_vocab, words=["a", "b", "c", "d"]) - assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] - matches = matcher(doc) - assert len(matches) == 1 - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern2 = Doc(en_vocab, words=["1", "2"]) - pattern2[0].norm_ = "c" - pattern2[1].norm_ = "d" - assert [t.norm_ for t in pattern2] == ["c", "d"] - matcher.add("TEST", [pattern2]) - matches = matcher(doc) - assert len(matches) == 1 diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py deleted file mode 100644 index e40565501..000000000 --- a/spacy/tests/regression/test_issue4030.py +++ /dev/null @@ -1,50 +0,0 @@ -import spacy -from spacy.util import minibatch -from thinc.api import compounding -from spacy.gold import Example - - -def test_issue4030(): - """ Test whether textcat works fine with empty doc """ - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - - nlp = spacy.blank("en") - - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - - # add a text categorizer component - textcat = nlp.create_pipe( - "textcat", - config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, - ) - - for label in unique_classes: - textcat.add_label(label) - nlp.add_pipe(textcat, last=True) - - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) - - # processing of an empty doc should result in 0.0 for all categories - doc = nlp("") - assert doc.cats["offensive"] == 0.0 - assert doc.cats["inoffensive"] == 0.0 diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py deleted file mode 100644 index f47290b92..000000000 --- a/spacy/tests/regression/test_issue4042.py +++ /dev/null @@ -1,85 +0,0 @@ -import spacy -from spacy.pipeline import 
EntityRecognizer, EntityRuler -from spacy.lang.en import English -from spacy.tokens import Span -from spacy.util import ensure_path -from spacy.pipeline.defaults import default_ner - -from ..util import make_tempdir - - -def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" - nlp = English() - - # add ner pipe - ner = nlp.create_pipe("ner") - ner.add_label("SOME_LABEL") - nlp.add_pipe(ner) - nlp.begin_training() - - # Add entity ruler - ruler = EntityRuler(nlp) - patterns = [ - {"label": "MY_ORG", "pattern": "Apple"}, - {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, - ] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler, before="ner") # works fine with "after" - doc1 = nlp("What do you think about Apple ?") - assert doc1.ents[0].label_ == "MY_ORG" - - with make_tempdir() as d: - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - - nlp2 = spacy.load(output_dir) - doc2 = nlp2("What do you think about Apple ?") - assert doc2.ents[0].label_ == "MY_ORG" - - -def test_issue4042_bug2(): - """ - Test that serialization of an NER works fine when new labels were added. - This is the second bug of two bugs underlying the issue 4042. - """ - nlp1 = English() - vocab = nlp1.vocab - - # add ner pipe - ner1 = nlp1.create_pipe("ner") - ner1.add_label("SOME_LABEL") - nlp1.add_pipe(ner1) - nlp1.begin_training() - - # add a new label to the doc - doc1 = nlp1("What do you think about Apple ?") - assert len(ner1.labels) == 1 - assert "SOME_LABEL" in ner1.labels - apple_ent = Span(doc1, 5, 6, label="MY_ORG") - doc1.ents = list(doc1.ents) + [apple_ent] - - # reapply the NER - at this point it should resize itself - ner1(doc1) - assert len(ner1.labels) == 2 - assert "SOME_LABEL" in ner1.labels - assert "MY_ORG" in ner1.labels - - with make_tempdir() as d: - # assert IO goes fine - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - ner1.to_disk(output_dir) - - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - ner2 = EntityRecognizer(vocab, default_ner(), **config) - ner2.from_disk(output_dir) - assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py deleted file mode 100644 index c52ded395..000000000 --- a/spacy/tests/regression/test_issue4054.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy.vocab import Vocab -import spacy -from spacy.lang.en import English -from spacy.util import ensure_path - -from ..util import make_tempdir - - -def test_issue4054(en_vocab): - """Test that a new blank model can be made with a vocab from file, - and that serialization does not drop the language at any point.""" - nlp1 = English() - vocab1 = nlp1.vocab - - with make_tempdir() as d: - vocab_dir = ensure_path(d / "vocab") - if not vocab_dir.exists(): - vocab_dir.mkdir() - vocab1.to_disk(vocab_dir) - - vocab2 = Vocab().from_disk(vocab_dir) - print("lang", vocab2.lang) - nlp2 = spacy.blank("en", vocab=vocab2) - - nlp_dir = ensure_path(d / "nlp") - if not nlp_dir.exists(): - nlp_dir.mkdir() - nlp2.to_disk(nlp_dir) - nlp3 = spacy.load(nlp_dir) - assert nlp3.lang == "en" diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py deleted file mode 100644 index 4849aa238..000000000 --- a/spacy/tests/regression/test_issue4120.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import 
Doc - - -def test_issue4120(en_vocab): - """Test that matches without a final {OP: ?} token are returned.""" - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) - doc1 = Doc(en_vocab, words=["a"]) - assert len(matcher(doc1)) == 1 # works - - doc2 = Doc(en_vocab, words=["a", "b", "c"]) - assert len(matcher(doc2)) == 2 # fixed - - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) - doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc3)) == 2 # works - - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) - doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc4)) == 3 # fixed diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py deleted file mode 100644 index a726806d7..000000000 --- a/spacy/tests/regression/test_issue4133.py +++ /dev/null @@ -1,28 +0,0 @@ -from spacy.lang.en import English -from spacy.tokens import Doc -from spacy.vocab import Vocab - - -def test_issue4133(en_vocab): - nlp = English() - vocab_bytes = nlp.vocab.to_bytes() - words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] - pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] - doc = Doc(en_vocab, words=words) - for i, token in enumerate(doc): - token.pos_ = pos[i] - - # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True - - doc_bytes = doc.to_bytes() - - vocab = Vocab() - vocab = vocab.from_bytes(vocab_bytes) - doc = Doc(vocab).from_bytes(doc_bytes) - - actual = [] - for token in doc: - actual.append(token.pos_) - - assert actual == pos diff --git a/spacy/tests/regression/test_issue4190.py b/spacy/tests/regression/test_issue4190.py deleted file mode 100644 index 97d532d2a..000000000 --- a/spacy/tests/regression/test_issue4190.py +++ /dev/null @@ -1,46 +0,0 @@ -from spacy.lang.en import English -from spacy.tokenizer import Tokenizer -from spacy import util - -from ..util import make_tempdir - - -def test_issue4190(): - test_string = "Test c." - # Load default language - nlp_1 = English() - doc_1a = nlp_1(test_string) - result_1a = [token.text for token in doc_1a] # noqa: F841 - # Modify tokenizer - customize_tokenizer(nlp_1) - doc_1b = nlp_1(test_string) - result_1b = [token.text for token in doc_1b] - # Save and Reload - with make_tempdir() as model_dir: - nlp_1.to_disk(model_dir) - nlp_2 = util.load_model(model_dir) - # This should be the modified tokenizer - doc_2 = nlp_2(test_string) - result_2 = [token.text for token in doc_2] - assert result_1b == result_2 - - -def customize_tokenizer(nlp): - prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes) - infix_re = util.compile_infix_regex(nlp.Defaults.infixes) - # Remove all exceptions where a single letter is followed by a period (e.g. 
'h.') - exceptions = { - k: v - for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() - if not (len(k) == 2 and k[1] == ".") - } - new_tokenizer = Tokenizer( - nlp.vocab, - exceptions, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=nlp.tokenizer.token_match, - ) - nlp.tokenizer = new_tokenizer diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py deleted file mode 100644 index 891f03b30..000000000 --- a/spacy/tests/regression/test_issue4267.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - - -def test_issue4267(): - """ Test that running an entity_ruler after ner gives consistent results""" - nlp = English() - ner = nlp.create_pipe("ner") - ner.add_label("PEOPLE") - nlp.add_pipe(ner) - nlp.begin_training() - - assert "ner" in nlp.pipe_names - - # assert that we have correct IOB annotations - doc1 = nlp("hi") - assert doc1.is_nered - for token in doc1: - assert token.ent_iob == 2 - - # add entity ruler and run again - ruler = EntityRuler(nlp) - patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] - - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - assert "entity_ruler" in nlp.pipe_names - assert "ner" in nlp.pipe_names - - # assert that we still have correct IOB annotations - doc2 = nlp("hi") - assert doc2.is_nered - for token in doc2: - assert token.ent_iob == 2 diff --git a/spacy/tests/regression/test_issue4272.py b/spacy/tests/regression/test_issue4272.py deleted file mode 100644 index 4bac97a44..000000000 --- a/spacy/tests/regression/test_issue4272.py +++ /dev/null @@ -1,9 +0,0 @@ -from spacy.lang.el import Greek - - -def test_issue4272(): - """Test that lookup table can be accessed from Token.lemma if no POS tags - are available.""" - nlp = Greek() - doc = nlp("Χθες") - assert doc[0].lemma_ diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py deleted file mode 100644 index ffbc41226..000000000 --- a/spacy/tests/regression/test_issue4278.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -from spacy.language import Language -from spacy.pipeline import Pipe - - -class DummyPipe(Pipe): - def __init__(self): - self.model = "dummy_model" - - def predict(self, docs): - return ([1, 2, 3], [4, 5, 6]) - - def set_annotations(self, docs, scores, tensors=None): - return docs - - -@pytest.fixture -def nlp(): - return Language() - - -def test_multiple_predictions(nlp): - doc = nlp.make_doc("foo") - dummy_pipe = DummyPipe() - dummy_pipe(doc) diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py deleted file mode 100644 index 3bddc26ca..000000000 --- a/spacy/tests/regression/test_issue4313.py +++ /dev/null @@ -1,47 +0,0 @@ -from collections import defaultdict - -import pytest - -from spacy.pipeline.defaults import default_ner -from spacy.pipeline import EntityRecognizer - -from spacy.lang.en import English -from spacy.tokens import Span - - -# skipped after removing Beam stuff during the Example/GoldParse refactor -@pytest.mark.skip -def test_issue4313(): - """ This should not crash or exit with some strange error code """ - beam_width = 16 - beam_density = 0.0001 - nlp = English() - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - ner = EntityRecognizer(nlp.vocab, default_ner(), **config) - ner.add_label("SOME_LABEL") - ner.begin_training([]) - nlp.add_pipe(ner) - - # 
add a new label to the doc - doc = nlp("What do you think about Apple ?") - assert len(ner.labels) == 1 - assert "SOME_LABEL" in ner.labels - apple_ent = Span(doc, 5, 6, label="MY_ORG") - doc.ents = list(doc.ents) + [apple_ent] - - # ensure the beam_parse still works with the new label - docs = [doc] - beams = nlp.entity.beam_parse( - docs, beam_width=beam_width, beam_density=beam_density - ) - - for doc, beam in zip(docs, beams): - entity_scores = defaultdict(float) - for score, ents in nlp.entity.moves.get_beam_parses(beam): - for start, end, label in ents: - entity_scores[(start, end, label)] += score diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py deleted file mode 100644 index 06b03df24..000000000 --- a/spacy/tests/regression/test_issue4348.py +++ /dev/null @@ -1,24 +0,0 @@ -from spacy.gold import Example -from spacy.lang.en import English -from spacy.util import minibatch -from thinc.api import compounding -import pytest - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_issue4348(): - """Test that training the tagger with empty data, doesn't throw errors""" - - nlp = English() - example = Example.from_dict(nlp.make_doc(""), {"tags": []}) - TRAIN_DATA = [example, example] - - tagger = nlp.create_pipe("tagger") - nlp.add_pipe(tagger) - - optimizer = nlp.begin_training() - for i in range(5): - losses = {} - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py deleted file mode 100644 index 917847a05..000000000 --- a/spacy/tests/regression/test_issue4367.py +++ /dev/null @@ -1,8 +0,0 @@ -from spacy.tokens import DocBin - - -def test_issue4367(): - """Test that docbin init goes well""" - DocBin() - DocBin(attrs=["LEMMA"]) - DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) diff --git a/spacy/tests/regression/test_issue4373.py b/spacy/tests/regression/test_issue4373.py deleted file mode 100644 index dbde1624e..000000000 --- a/spacy/tests/regression/test_issue4373.py +++ /dev/null @@ -1,10 +0,0 @@ -from spacy.matcher import Matcher, PhraseMatcher -from spacy.vocab import Vocab - - -def test_issue4373(): - """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" - matcher = Matcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - matcher = PhraseMatcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py deleted file mode 100644 index 9c596aaf6..000000000 --- a/spacy/tests/regression/test_issue4402.py +++ /dev/null @@ -1,98 +0,0 @@ -from spacy.gold import Corpus -from spacy.lang.en import English - -from ..util import make_tempdir -from ...gold.converters import json2docs -from ...tokens import DocBin - - -def test_issue4402(): - nlp = English() - attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] - with make_tempdir() as tmpdir: - output_file = tmpdir / "test4402.spacy" - docs = json2docs([json_data]) - data = DocBin(docs=docs, attrs=attrs).to_bytes() - with output_file.open("wb") as file_: - file_.write(data) - corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - - train_data = list(corpus.train_dataset(nlp)) - assert len(train_data) == 2 - - split_train_data = [] - for eg in train_data: - split_train_data.extend(eg.split_sents()) - assert len(split_train_data) == 4 - - -json_data = { - "id": 0, - 
"paragraphs": [ - { - "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "How", "ner": "O"}, - {"id": 1, "orth": "should", "ner": "O"}, - {"id": 2, "orth": "I", "ner": "O"}, - {"id": 3, "orth": "cook", "ner": "O"}, - {"id": 4, "orth": "bacon", "ner": "O"}, - {"id": 5, "orth": "in", "ner": "O"}, - {"id": 6, "orth": "an", "ner": "O"}, - {"id": 7, "orth": "oven", "ner": "O"}, - {"id": 8, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - { - "tokens": [ - {"id": 9, "orth": "\n", "ner": "O"}, - {"id": 10, "orth": "I", "ner": "O"}, - {"id": 11, "orth": "'ve", "ner": "O"}, - {"id": 12, "orth": "heard", "ner": "O"}, - {"id": 13, "orth": "of", "ner": "O"}, - {"id": 14, "orth": "people", "ner": "O"}, - {"id": 15, "orth": "cooking", "ner": "O"}, - {"id": 16, "orth": "bacon", "ner": "O"}, - {"id": 17, "orth": "in", "ner": "O"}, - {"id": 18, "orth": "an", "ner": "O"}, - {"id": 19, "orth": "oven", "ner": "O"}, - {"id": 20, "orth": ".", "ner": "O"}, - ], - "brackets": [], - }, - ], - "cats": [ - {"label": "baking", "value": 1.0}, - {"label": "not_baking", "value": 0.0}, - ], - }, - { - "raw": "What is the difference between white and brown eggs?\n", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "What", "ner": "O"}, - {"id": 1, "orth": "is", "ner": "O"}, - {"id": 2, "orth": "the", "ner": "O"}, - {"id": 3, "orth": "difference", "ner": "O"}, - {"id": 4, "orth": "between", "ner": "O"}, - {"id": 5, "orth": "white", "ner": "O"}, - {"id": 6, "orth": "and", "ner": "O"}, - {"id": 7, "orth": "brown", "ner": "O"}, - {"id": 8, "orth": "eggs", "ner": "O"}, - {"id": 9, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, - ], - "cats": [ - {"label": "baking", "value": 0.0}, - {"label": "not_baking", "value": 1.0}, - ], - }, - ], -} diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py new file mode 100644 index 000000000..01d7a1dbb --- /dev/null +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -0,0 +1,288 @@ +import pytest +from mock import Mock +from spacy.pipeline import EntityRuler +from spacy.matcher import DependencyMatcher +from spacy.tokens import Doc, Span, DocBin +from spacy.gold import Example +from spacy.gold.converters.conllu2docs import conllu2docs +from spacy.lang.en import English +from spacy.kb import KnowledgeBase +from spacy.vocab import Vocab +from spacy.language import Language +from spacy.util import ensure_path, load_model_from_path +import numpy +import pickle + +from ..util import get_doc, make_tempdir + + +def test_issue4528(en_vocab): + """Test that user_data is correctly serialized in DocBin.""" + doc = Doc(en_vocab, words=["hello", "world"]) + doc.user_data["foo"] = "bar" + # This is how extension attribute values are stored in the user data + doc.user_data[("._.", "foo", None, None)] = "bar" + doc_bin = DocBin(store_user_data=True) + doc_bin.add(doc) + doc_bin_bytes = doc_bin.to_bytes() + new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) + new_doc = list(new_doc_bin.get_docs(en_vocab))[0] + assert new_doc.user_data["foo"] == "bar" + assert new_doc.user_data[("._.", "foo", None, None)] == "bar" + + +@pytest.mark.parametrize( + "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] +) +def test_gold_misaligned(en_tokenizer, text, words): + doc = en_tokenizer(text) + Example.from_dict(doc, {"words": words}) + + +def 
test_issue4590(en_vocab):
+    """Test that the matches param in the on_match method is the same as the matches from a run with no on_match method"""
+    pattern = [
+        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
+        {
+            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+    ]
+
+    on_match = Mock()
+    matcher = DependencyMatcher(en_vocab)
+    matcher.add("pattern", on_match, pattern)
+    text = "The quick brown fox jumped over the lazy fox"
+    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
+    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
+    matches = matcher(doc)
+    on_match_args = on_match.call_args
+    assert on_match_args[0][3] == matches
+
+
+def test_issue4651_with_phrase_matcher_attr():
+    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
+    the method from_disk when the EntityRuler argument phrase_matcher_attr is
+    specified.
+    """
+    text = "Spacy is a python library for nlp"
+    nlp = English()
+    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
+    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+    doc = nlp(text)
+    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
+    nlp_reloaded = English()
+    with make_tempdir() as d:
+        file_path = d / "entityruler"
+        ruler.to_disk(file_path)
+        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
+    nlp_reloaded.add_pipe(ruler_reloaded)
+    doc_reloaded = nlp_reloaded(text)
+    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
+    assert res == res_reloaded
+
+
+def test_issue4651_without_phrase_matcher_attr():
+    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
+    the method from_disk when the EntityRuler argument phrase_matcher_attr is
+    not specified.
+    """
+    text = "Spacy is a python library for nlp"
+    nlp = English()
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+    doc = nlp(text)
+    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
+    nlp_reloaded = English()
+    with make_tempdir() as d:
+        file_path = d / "entityruler"
+        ruler.to_disk(file_path)
+        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
+    nlp_reloaded.add_pipe(ruler_reloaded)
+    doc_reloaded = nlp_reloaded(text)
+    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
+    assert res == res_reloaded
+
+
+def test_issue4665():
+    """
+    conllu2docs should not raise an exception if the HEAD column contains an
+    underscore
+    """
+    input_data = """
+1 [ _ PUNCT -LRB- _ _ punct _ _
+2 This _ DET DT _ _ det _ _
+3 killing _ NOUN NN _ _ nsubj _ _
+4 of _ ADP IN _ _ case _ _
+5 a _ DET DT _ _ det _ _
+6 respected _ ADJ JJ _ _ amod _ _
+7 cleric _ NOUN NN _ _ nmod _ _
+8 will _ AUX MD _ _ aux _ _
+9 be _ AUX VB _ _ aux _ _
+10 causing _ VERB VBG _ _ root _ _
+11 us _ PRON PRP _ _ iobj _ _
+12 trouble _ NOUN NN _ _ dobj _ _
+13 for _ ADP IN _ _ case _ _
+14 years _ NOUN NNS _ _ nmod _ _
+15 to _ PART TO _ _ mark _ _
+16 come _ VERB VB _ _ acl _ _
+17 . _ PUNCT . 
_ _ punct _ _ +18 ] _ PUNCT -RRB- _ _ punct _ _ +""" + conllu2docs(input_data) + + +def test_issue4674(): + """Test that setting entities with overlapping identifiers does not mess up IO""" + nlp = English() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + vector1 = [0.9, 1.1, 1.01] + vector2 = [1.8, 2.25, 2.01] + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) + assert kb.get_size_entities() == 1 + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + kb.dump(str(file_path)) + kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) + kb2.load_bulk(str(file_path)) + assert kb2.get_size_entities() == 1 + + +def test_issue4707(): + """Tests that disabled component names are also excluded from nlp.from_disk + by default when loading a model. + """ + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(nlp.create_pipe("entity_ruler")) + assert nlp.pipe_names == ["sentencizer", "entity_ruler"] + exclude = ["tokenizer", "sentencizer"] + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir, exclude=exclude) + new_nlp = load_model_from_path(tmpdir, disable=exclude) + assert "sentencizer" not in new_nlp.pipe_names + assert "entity_ruler" in new_nlp.pipe_names + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4725_1(): + """ Ensure the pickling of the NER goes well""" + vocab = Vocab(vectors_name="test_vocab_add_vector") + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) + with make_tempdir() as tmp_path: + with (tmp_path / "ner.pkl").open("wb") as file_: + pickle.dump(ner, file_) + assert ner.cfg["min_action_freq"] == 342 + + with (tmp_path / "ner.pkl").open("rb") as file_: + ner2 = pickle.load(file_) + assert ner2.cfg["min_action_freq"] == 342 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4725_2(): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows) + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + nlp.begin_training() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + + +def test_issue4849(): + nlp = English() + ruler = EntityRuler( + nlp, + patterns=[ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, + ], + phrase_matcher_attr="LOWER", + ) + nlp.add_pipe(ruler) + text = """ + The left is starting to take aim at Democratic front-runner Joe Biden. + Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." 
+ """ + # USING 1 PROCESS + count_ents = 0 + for doc in nlp.pipe([text], n_process=1): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + # USING 2 PROCESSES + count_ents = 0 + for doc in nlp.pipe([text], n_process=2): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + + +class CustomPipe: + name = "my_pipe" + + def __init__(self): + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + +def test_issue4903(): + """Ensure that this runs correctly and doesn't hang or crash on Windows / + macOS.""" + nlp = English() + custom_component = CustomPipe() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(custom_component, after="sentencizer") + + text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + docs = list(nlp.pipe(text, n_process=2)) + assert docs[0].text == "I like bananas." + assert docs[1].text == "Do you like them?" + assert docs[2].text == "No, I prefer wasabi." + + +def test_issue4924(): + nlp = Language() + example = Example.from_dict(nlp.make_doc(""), {}) + nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py deleted file mode 100644 index 6f96c9f2d..000000000 --- a/spacy/tests/regression/test_issue4528.py +++ /dev/null @@ -1,16 +0,0 @@ -from spacy.tokens import Doc, DocBin - - -def test_issue4528(en_vocab): - """Test that user_data is correctly serialized in DocBin.""" - doc = Doc(en_vocab, words=["hello", "world"]) - doc.user_data["foo"] = "bar" - # This is how extension attribute values are stored in the user data - doc.user_data[("._.", "foo", None, None)] = "bar" - doc_bin = DocBin(store_user_data=True) - doc_bin.add(doc) - doc_bin_bytes = doc_bin.to_bytes() - new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) - new_doc = list(new_doc_bin.get_docs(en_vocab))[0] - assert new_doc.user_data["foo"] == "bar" - assert new_doc.user_data[("._.", "foo", None, None)] == "bar" diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py deleted file mode 100644 index 0708499de..000000000 --- a/spacy/tests/regression/test_issue4529.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest - -from spacy.gold import Example - - -@pytest.mark.parametrize( - "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] -) -def test_gold_misaligned(en_tokenizer, text, words): - doc = en_tokenizer(text) - Example.from_dict(doc, {"words": words}) diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py deleted file mode 100644 index fc49c5117..000000000 --- a/spacy/tests/regression/test_issue4590.py +++ /dev/null @@ -1,35 +0,0 @@ -from mock import Mock -from spacy.matcher import DependencyMatcher -from ..util import get_doc - - -def test_issue4590(en_vocab): - """Test that matches param in on_match method are the same as matches run with no on_match method""" - pattern = [ - {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - { - "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, - "PATTERN": {"ORTH": "fox"}, - }, - { - "SPEC": {"NODE_NAME": "quick", 
"NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, - "PATTERN": {"ORTH": "fox"}, - }, - ] - - on_match = Mock() - - matcher = DependencyMatcher(en_vocab) - matcher.add("pattern", on_match, pattern) - - text = "The quick brown fox jumped over the lazy fox" - heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] - deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] - - doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) - - matches = matcher(doc) - - on_match_args = on_match.call_args - - assert on_match_args[0][3] == matches diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py deleted file mode 100644 index 3f6c1a57c..000000000 --- a/spacy/tests/regression/test_issue4651.py +++ /dev/null @@ -1,62 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - -from ..util import make_tempdir - - -def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - specified. - """ - text = "Spacy is a python library for nlp" - - nlp = English() - ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) - - nlp_reloaded.add_pipe(ruler_reloaded) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - - assert res == res_reloaded - - -def test_issue4651_without_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - not specified. 
- """ - text = "Spacy is a python library for nlp" - - nlp = English() - ruler = EntityRuler(nlp) - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) - - nlp_reloaded.add_pipe(ruler_reloaded) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - - assert res == res_reloaded diff --git a/spacy/tests/regression/test_issue4665.py b/spacy/tests/regression/test_issue4665.py deleted file mode 100644 index e28d0f44a..000000000 --- a/spacy/tests/regression/test_issue4665.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest - -# TODO -# from spacy.gold.converters.conllu2docs import conllu2docs - -input_data = """ -1 [ _ PUNCT -LRB- _ _ punct _ _ -2 This _ DET DT _ _ det _ _ -3 killing _ NOUN NN _ _ nsubj _ _ -4 of _ ADP IN _ _ case _ _ -5 a _ DET DT _ _ det _ _ -6 respected _ ADJ JJ _ _ amod _ _ -7 cleric _ NOUN NN _ _ nmod _ _ -8 will _ AUX MD _ _ aux _ _ -9 be _ AUX VB _ _ aux _ _ -10 causing _ VERB VBG _ _ root _ _ -11 us _ PRON PRP _ _ iobj _ _ -12 trouble _ NOUN NN _ _ dobj _ _ -13 for _ ADP IN _ _ case _ _ -14 years _ NOUN NNS _ _ nmod _ _ -15 to _ PART TO _ _ mark _ _ -16 come _ VERB VB _ _ acl _ _ -17 . _ PUNCT . _ _ punct _ _ -18 ] _ PUNCT -RRB- _ _ punct _ _ -""" - - -@pytest.mark.xfail -def test_issue4665(): - """ - conllu2json should not raise an exception if the HEAD column contains an - underscore - """ - pass - # conllu2json(input_data) diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py deleted file mode 100644 index 149e1431b..000000000 --- a/spacy/tests/regression/test_issue4674.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest -from spacy.kb import KnowledgeBase -from spacy.util import ensure_path -from spacy.lang.en import English - -from ..util import make_tempdir - - -def test_issue4674(): - """Test that setting entities with overlapping identifiers does not mess up IO""" - nlp = English() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) - - vector1 = [0.9, 1.1, 1.01] - vector2 = [1.8, 2.25, 2.01] - with pytest.warns(UserWarning): - kb.set_entities( - entity_list=["Q1", "Q1"], - freq_list=[32, 111], - vector_list=[vector1, vector2], - ) - - assert kb.get_size_entities() == 1 - - # dumping to file & loading back in - with make_tempdir() as d: - dir_path = ensure_path(d) - if not dir_path.exists(): - dir_path.mkdir() - file_path = dir_path / "kb" - kb.dump(str(file_path)) - - kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) - kb2.load_bulk(str(file_path)) - - assert kb2.get_size_entities() == 1 diff --git a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py deleted file mode 100644 index d9798ef84..000000000 --- a/spacy/tests/regression/test_issue4707.py +++ /dev/null @@ -1,20 +0,0 @@ -from spacy.util import load_model_from_path -from spacy.lang.en import English - -from ..util import make_tempdir - - -def test_issue4707(): - """Tests that disabled component names are also excluded from nlp.from_disk - by default when loading a model. 
- """ - nlp = English() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(nlp.create_pipe("entity_ruler")) - assert nlp.pipe_names == ["sentencizer", "entity_ruler"] - exclude = ["tokenizer", "sentencizer"] - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir, exclude=exclude) - new_nlp = load_model_from_path(tmpdir, disable=exclude) - assert "sentencizer" not in new_nlp.pipe_names - assert "entity_ruler" in new_nlp.pipe_names diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py deleted file mode 100644 index cdc3c09ca..000000000 --- a/spacy/tests/regression/test_issue4725.py +++ /dev/null @@ -1,41 +0,0 @@ -import pickle -import numpy - -from spacy.lang.en import English -from spacy.vocab import Vocab - -from spacy.tests.util import make_tempdir - - -def test_pickle_ner(): - """ Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") - nlp = English(vocab=vocab) - ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) - with make_tempdir() as tmp_path: - with (tmp_path / "ner.pkl").open("wb") as file_: - pickle.dump(ner, file_) - assert ner.cfg["min_action_freq"] == 342 - - with (tmp_path / "ner.pkl").open("rb") as file_: - ner2 = pickle.load(file_) - assert ner2.cfg["min_action_freq"] == 342 - - -def test_issue4725(): - # ensures that this runs correctly and doesn't hang or crash because of the global vectors - # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows) - vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") - data[0] = 1.0 - data[1] = 2.0 - vocab.set_vector("cat", data[0]) - vocab.set_vector("dog", data[1]) - - nlp = English(vocab=vocab) - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - nlp.begin_training() - docs = ["Kurt is in London."] * 10 - for _ in nlp.pipe(docs, batch_size=2, n_process=2): - pass diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py deleted file mode 100644 index ddbf6f7a0..000000000 --- a/spacy/tests/regression/test_issue4849.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - - -def test_issue4849(): - nlp = English() - - ruler = EntityRuler( - nlp, - patterns=[ - {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, - {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, - ], - phrase_matcher_attr="LOWER", - ) - - nlp.add_pipe(ruler) - - text = """ - The left is starting to take aim at Democratic front-runner Joe Biden. - Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." 
- """ - - # USING 1 PROCESS - count_ents = 0 - for doc in nlp.pipe([text], n_process=1): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - - # USING 2 PROCESSES - count_ents = 0 - for doc in nlp.pipe([text], n_process=2): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py deleted file mode 100644 index a3dff16aa..000000000 --- a/spacy/tests/regression/test_issue4903.py +++ /dev/null @@ -1,40 +0,0 @@ -from spacy.lang.en import English -from spacy.tokens import Span, Doc - - -class CustomPipe: - name = "my_pipe" - - def __init__(self): - Span.set_extension("my_ext", getter=self._get_my_ext) - Doc.set_extension("my_ext", default=None) - - def __call__(self, doc): - gathered_ext = [] - for sent in doc.sents: - sent_ext = self._get_my_ext(sent) - sent._.set("my_ext", sent_ext) - gathered_ext.append(sent_ext) - - doc._.set("my_ext", "\n".join(gathered_ext)) - - return doc - - @staticmethod - def _get_my_ext(span): - return str(span.end) - - -def test_issue4903(): - # ensures that this runs correctly and doesn't hang or crash on Windows / macOS - - nlp = English() - custom_component = CustomPipe() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(custom_component, after="sentencizer") - - text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] - docs = list(nlp.pipe(text, n_process=2)) - assert docs[0].text == "I like bananas." - assert docs[1].text == "Do you like them?" - assert docs[2].text == "No, I prefer wasabi." diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py deleted file mode 100644 index c3d3c4326..000000000 --- a/spacy/tests/regression/test_issue4924.py +++ /dev/null @@ -1,8 +0,0 @@ -from spacy.gold import Example -from spacy.language import Language - - -def test_issue4924(): - nlp = Language() - example = Example.from_dict(nlp.make_doc(""), {}) - nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index a9a57746d..3c1cee5c3 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,6 +1,8 @@ +import pytest from spacy.lang.en import English +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue5152(): # Test that the comparison between a Span and a Token, goes well # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) @@ -8,7 +10,6 @@ def test_issue5152(): text = nlp("Talk about being boring!") text_var = nlp("Talk of being boring!") y = nlp("Let") - span = text[0:3] # Talk about being span_2 = text[0:3] # Talk about being span_3 = text_var[0:3] # Talk of being diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9ffa3862c..86020bf17 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -63,7 +63,8 @@ def tagger(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. 
the affected line is the one for model serialization - tagger.begin_training(pipeline=nlp.pipeline) + with pytest.warns(UserWarning): + tagger.begin_training(pipeline=nlp.pipeline) return tagger diff --git a/spacy/tests/regression/test_issue5551.py b/spacy/tests/regression/test_issue5551.py new file mode 100644 index 000000000..a8be4cab4 --- /dev/null +++ b/spacy/tests/regression/test_issue5551.py @@ -0,0 +1,31 @@ +from spacy.lang.en import English +from spacy.util import fix_random_seed + + +def test_issue5551(): + """Test that after fixing the random seed, the results of the pipeline are truly identical""" + component = "textcat" + pipe_cfg = {"exclusive_classes": False} + + results = [] + for i in range(3): + fix_random_seed(0) + nlp = English() + example = ( + "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.", + {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}, + ) + nlp.add_pipe(nlp.create_pipe(component, config=pipe_cfg), last=True) + pipe = nlp.get_pipe(component) + for label in set(example[1]["cats"]): + pipe.add_label(label) + nlp.begin_training(component_cfg={component: pipe_cfg}) + + # Store the result of each iteration + result = pipe.model.predict([nlp.make_doc(example[0])]) + results.append(list(result[0])) + + # All results should be the same because of the fixed seed + assert len(results) == 3 + assert results[0] == results[1] + assert results[0] == results[2] diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 24f2bbc13..0b0ba5cad 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,3 +1,4 @@ +import numpy from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import spans_from_biluo_tags, iob_to_biluo @@ -5,6 +6,7 @@ from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example from spacy.gold.converters import json2docs from spacy.lang.en import English +from spacy.pipeline import EntityRuler from spacy.tokens import Doc, DocBin from spacy.util import get_words_and_spaces, minibatch from thinc.api import compounding @@ -153,6 +155,27 @@ def test_gold_biluo_misalign(en_vocab): assert tags == ["O", "O", "O", "-", "-", "-"] +def test_example_constructor(en_vocab): + words = ["I", "like", "stuff"] + tags = ["NOUN", "VERB", "NOUN"] + tag_ids = [en_vocab.strings.add(tag) for tag in tags] + predicted = Doc(en_vocab, words=words) + reference = Doc(en_vocab, words=words) + reference = reference.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) + example = Example(predicted, reference) + tags = example.get_aligned("TAG", as_string=True) + assert tags == ["NOUN", "VERB", "NOUN"] + + +def test_example_from_dict_tags(en_vocab): + words = ["I", "like", "stuff"] + tags = ["NOUN", "VERB", "NOUN"] + predicted = Doc(en_vocab, words=words) + example = Example.from_dict(predicted, {"TAGS": tags}) + tags = example.get_aligned("TAG", as_string=True) + assert tags == ["NOUN", "VERB", "NOUN"] + + def test_example_from_dict_no_ner(en_vocab): words = ["a", "b", "c", "d"] spaces = [True, True, False, True] @@ -272,72 +295,72 @@ def test_split_sentences(en_vocab): def test_gold_biluo_one_to_many(en_vocab, en_tokenizer): - words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."] + words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."] spaces = [True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - prefix = "Mr. and Mrs. 
Smith flew to " + prefix = "Mr and Mrs Smith flew to " entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] - gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "U-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs."), "PERSON"), # "Mrs." is a Person + (len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", None, "O", "U-LOC", "O"] def test_gold_biluo_many_to_one(en_vocab, en_tokenizer): - words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] spaces = [True, True, True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - prefix = "Mr. and Mrs. Smith flew to " + prefix = "Mr and Mrs Smith flew to " entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] - gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."] + gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."] + gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] def test_gold_biluo_misaligned(en_vocab, en_tokenizer): - words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."] + words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - prefix = "Mr. and Mrs. 
Smith flew to " + prefix = "Mr and Mrs Smith flew to " entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] - gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."] + gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."] + gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"] @@ -407,6 +430,49 @@ def test_biluo_spans(en_tokenizer): assert spans[1].label_ == "GPE" +def test_aligned_spans_y2x(en_vocab, en_tokenizer): + words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."] + spaces = [True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + prefix = "Mr and Mrs Smith flew to " + entities = [ + (0, len("Mr and Mrs Smith"), "PERSON"), + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities}) + ents_ref = example.reference.ents + assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)] + ents_y2x = example.get_aligned_spans_y2x(ents_ref) + assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)] + + +def test_aligned_spans_x2y(en_vocab, en_tokenizer): + text = "Mr and Mrs Smith flew to San Francisco Valley" + nlp = English() + ruler = EntityRuler(nlp) + patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"}, + {"label": "LOC", "pattern": "San Francisco Valley"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)] + prefix = "Mr and Mrs Smith flew to " + entities = [ + (0, len("Mr and Mrs Smith"), "PERSON"), + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"] + example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities}) + assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)] + + # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct + ents_pred = example.predicted.ents + assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)] + ents_x2y = example.get_aligned_spans_x2y(ents_pred) + assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)] + + def test_gold_ner_missing_tags(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] @@ -414,6 +480,16 @@ def test_gold_ner_missing_tags(en_tokenizer): assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] +def test_projectivize(en_tokenizer): + doc = en_tokenizer("He pretty quickly walks away") + heads = [3, 2, 3, 0, 2] + example = Example.from_dict(doc, {"heads": 
heads}) + proj_heads, proj_labels = example.get_aligned_parse(projectivize=True) + nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False) + assert proj_heads == [3, 2, 3, 0, 3] + assert nonproj_heads == [3, 2, 3, 0, 2] + + def test_iob_to_biluo(): good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py new file mode 100644 index 000000000..c3270c556 --- /dev/null +++ b/spacy/tests/test_models.py @@ -0,0 +1,156 @@ +from typing import List + +import pytest +from thinc.api import fix_random_seed, Adam, set_dropout_rate +from numpy.testing import assert_array_equal +import numpy + +from spacy.ml.models import build_Tok2Vec_model +from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier +from spacy.lang.en import English +from spacy.lang.en.examples import sentences as EN_SENTENCES + + +def get_all_params(model): + params = [] + for node in model.walk(): + for name in node.param_names: + params.append(node.get_param(name).ravel()) + return node.ops.xp.concatenate(params) + + +def get_docs(): + nlp = English() + return list(nlp.pipe(EN_SENTENCES + [" ".join(EN_SENTENCES)])) + + +def get_gradient(model, Y): + if isinstance(Y, model.ops.xp.ndarray): + dY = model.ops.alloc(Y.shape, dtype=Y.dtype) + dY += model.ops.xp.random.uniform(-1.0, 1.0, Y.shape) + return dY + elif isinstance(Y, List): + return [get_gradient(model, y) for y in Y] + else: + raise ValueError(f"Could not get gradient for type {type(Y)}") + + +def default_tok2vec(): + return build_Tok2Vec_model(**TOK2VEC_KWARGS) + + +TOK2VEC_KWARGS = { + "width": 96, + "embed_size": 2000, + "subword_features": True, + "char_embed": False, + "conv_depth": 4, + "bilstm_depth": 0, + "maxout_pieces": 4, + "window_size": 1, + "dropout": 0.1, + "nM": 0, + "nC": 0, + "pretrained_vectors": None, +} + +TEXTCAT_KWARGS = { + "width": 64, + "embed_size": 2000, + "pretrained_vectors": None, + "exclusive_classes": False, + "ngram_size": 1, + "window_size": 1, + "conv_depth": 2, + "dropout": None, + "nO": 7 +} + +TEXTCAT_CNN_KWARGS = { + "tok2vec": default_tok2vec(), + "exclusive_classes": False, + "nO": 13, +} + + +@pytest.mark.parametrize( + "seed,model_func,kwargs", + [ + (0, build_Tok2Vec_model, TOK2VEC_KWARGS), + (0, build_text_classifier, TEXTCAT_KWARGS), + (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS), + ], +) +def test_models_initialize_consistently(seed, model_func, kwargs): + fix_random_seed(seed) + model1 = model_func(**kwargs) + model1.initialize() + fix_random_seed(seed) + model2 = model_func(**kwargs) + model2.initialize() + params1 = get_all_params(model1) + params2 = get_all_params(model2) + assert_array_equal(params1, params2) + + +@pytest.mark.parametrize( + "seed,model_func,kwargs,get_X", + [ + (0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs), + (0, build_text_classifier, TEXTCAT_KWARGS, get_docs), + (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs), + ], +) +def test_models_predict_consistently(seed, model_func, kwargs, get_X): + fix_random_seed(seed) + model1 = model_func(**kwargs).initialize() + Y1 = model1.predict(get_X()) + fix_random_seed(seed) + model2 = model_func(**kwargs).initialize() + Y2 = model2.predict(get_X()) + + if model1.has_ref("tok2vec"): + tok2vec1 = model1.get_ref("tok2vec").predict(get_X()) + tok2vec2 = model2.get_ref("tok2vec").predict(get_X()) + for i in range(len(tok2vec1)): + for j in range(len(tok2vec1[i])): 
+ assert_array_equal(numpy.asarray(tok2vec1[i][j]), numpy.asarray(tok2vec2[i][j])) + + if isinstance(Y1, numpy.ndarray): + assert_array_equal(Y1, Y2) + elif isinstance(Y1, List): + assert len(Y1) == len(Y2) + for y1, y2 in zip(Y1, Y2): + assert_array_equal(y1, y2) + else: + raise ValueError(f"Could not compare type {type(Y1)}") + + +@pytest.mark.parametrize( + "seed,dropout,model_func,kwargs,get_X", + [ + (0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs), + (0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs), + (0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs), + ], +) +def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): + def get_updated_model(): + fix_random_seed(seed) + optimizer = Adam(0.001) + model = model_func(**kwargs).initialize() + initial_params = get_all_params(model) + set_dropout_rate(model, dropout) + for _ in range(5): + Y, get_dX = model.begin_update(get_X()) + dY = get_gradient(model, Y) + _ = get_dX(dY) + model.finish_update(optimizer) + updated_params = get_all_params(model) + with pytest.raises(AssertionError): + assert_array_equal(initial_params, updated_params) + return model + + model1 = get_updated_model() + model2 = get_updated_model() + assert_array_equal(get_all_params(model1), get_all_params(model2)) diff --git a/spacy/tests/test_projects.py b/spacy/tests/test_projects.py new file mode 100644 index 000000000..c3477f463 --- /dev/null +++ b/spacy/tests/test_projects.py @@ -0,0 +1,31 @@ +import pytest +from spacy.cli.project.util import validate_project_commands +from spacy.schemas import ProjectConfigSchema, validate + + +@pytest.mark.parametrize( + "config", + [ + {"commands": [{"name": "a"}, {"name": "a"}]}, + {"commands": [{"name": "a"}], "workflows": {"a": []}}, + {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}}, + ], +) +def test_project_config_validation1(config): + with pytest.raises(SystemExit): + validate_project_commands(config) + + +@pytest.mark.parametrize( + "config,n_errors", + [ + ({"commands": {"a": []}}, 1), + ({"commands": [{"help": "..."}]}, 1), + ({"commands": [{"name": "a", "extra": "b"}]}, 1), + ({"commands": [{"extra": "b"}]}, 2), + ({"commands": [{"name": "a", "deps": [123]}]}, 1), + ], +) +def test_project_config_validation2(config, n_errors): + errors = validate(ProjectConfigSchema, config) + assert len(errors) == n_errors diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ca9230d98..f28bd3374 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -803,7 +803,7 @@ cdef class Doc: attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] if array.dtype != numpy.uint64: - warnings.warn(Warnings.W101.format(type=array.dtype)) + warnings.warn(Warnings.W028.format(type=array.dtype)) if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) diff --git a/spacy/util.py b/spacy/util.py index 4a17b7f24..4ed002f37 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -20,7 +20,6 @@ import subprocess from contextlib import contextmanager import tempfile import shutil -import hashlib import shlex try: @@ -449,6 +448,16 @@ def split_command(command: str) -> List[str]: return shlex.split(command, posix=not is_windows) +def join_command(command: List[str]) -> str: + """Join a command using shlex. shlex.join is only available for Python 3.8+, + so we're using a workaround here. + + command (List[str]): The command to join. 
+    RETURNS (str): The joined command.
+    """
+    return " ".join(shlex.quote(cmd) for cmd in command)
+
+
 def run_command(command: Union[str, List[str]]) -> None:
     """Run a command on the command line as a subprocess. If the subprocess
     returns a non-zero exit code, a system exit is performed.
@@ -501,23 +510,13 @@ def make_tempdir():
         warnings.warn(Warnings.W091.format(dir=d, msg=e))
 
 
-def get_hash(data) -> str:
-    """Get the hash for a JSON-serializable object.
+def is_cwd(path: Union[Path, str]) -> bool:
+    """Check whether a path is the current working directory.
 
-    data: The data to hash.
-    RETURNS (str): The hash.
+    path (Union[Path, str]): The directory path.
+    RETURNS (bool): Whether the path is the current working directory.
     """
-    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
-    return hashlib.md5(data_str).hexdigest()
-
-
-def get_checksum(path: Union[Path, str]) -> str:
-    """Get the checksum for a file given its file path.
-
-    path (Union[Path, str]): The file path.
-    RETURNS (str): The checksum.
-    """
-    return hashlib.md5(Path(path).read_bytes()).hexdigest()
+    return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
 
 
 def is_in_jupyter():
@@ -722,6 +721,51 @@ def minibatch(items, size=8):
         yield list(batch)
 
 
+def minibatch_by_padded_size(docs, size, buffer=256, discard_oversize=False):
+    """Minibatch a sequence of docs so that the padded size of each batch
+    (the length of its longest sequence multiplied by the number of sequences)
+    stays close to the target size. Docs are buffered in chunks of `buffer`
+    and grouped by length within each chunk. If `discard_oversize` is set,
+    batches whose padded size exceeds the target are dropped instead of
+    yielded.
+    """
+    if isinstance(size, int):
+        size_ = itertools.repeat(size)
+    else:
+        size_ = size
+    for outer_batch in minibatch(docs, buffer):
+        outer_batch = list(outer_batch)
+        target_size = next(size_)
+        for indices in _batch_by_length(outer_batch, target_size):
+            subbatch = [outer_batch[i] for i in indices]
+            padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
+            if discard_oversize and padded_size >= target_size:
+                pass
+            else:
+                yield subbatch
+
+
+def _batch_by_length(seqs, max_words):
+    """Given a list of sequences, return a batched list of indices into the
+    list, where the batches are grouped by length, in descending order.
+
+    Batches may be at most `max_words` in size, defined as the length of the
+    longest sequence in the batch multiplied by the batch size.
+    """
+    # Sort by length (ascending), using the original position as a tiebreaker.
+    lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)]
+    lengths_indices.sort()
+    batches = []
+    batch = []
+    for length, i in lengths_indices:
+        if not batch:
+            batch.append(i)
+        elif length * (len(batch) + 1) <= max_words:
+            batch.append(i)
+        else:
+            batches.append(batch)
+            batch = [i]
+    if batch:
+        batches.append(batch)
+    # Check lengths match
+    assert sum(len(b) for b in batches) == len(seqs)
+    batches = [list(sorted(batch)) for batch in batches]
+    batches.reverse()
+    return batches
+
+
 def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
     """Create minibatches of roughly a given number of words. If any examples
     are longer than the specified batch length, they will appear in a batch by
@@ -768,7 +812,8 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
     # yield the previous batch and start a new one. The new one gets the overflow examples.
else: - yield batch + if batch: + yield batch target_size = next(size_) tol_size = target_size * tolerance batch = overflow @@ -788,15 +833,15 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): # this example does not fit with the previous overflow: start another new batch else: - yield batch + if batch: + yield batch target_size = next(size_) tol_size = target_size * tolerance batch = [doc] batch_size = n_words - # yield the final batch + batch.extend(overflow) if batch: - batch.extend(overflow) yield batch diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index f463d6df2..b15f7b119 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -4,4 +4,34 @@ teaser: Pre-defined model architectures included with the core library source: spacy/ml/models --- -TODO: write +TODO: intro and how architectures work, link to +[`registry`](/api/top-level#registry), +[custom models](/usage/training#custom-models) usage etc. + +## Parser architectures {source="spacy/ml/models/parser.py"} + +### spacy.TransitionBasedParser.v1 + + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TransitionBasedParser.v1" +> nr_feature_tokens = 6 +> hidden_width = 64 +> maxout_pieces = 2 +> +> [model.tok2vec] +> # ... +> ``` + +| Name | Type | Description | +| ------------------- | ------------------------------------------ | ----------- | +| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | | +| `nr_feature_tokens` | int | | +| `hidden_width` | int | | +| `maxout_pieces` | int | | +| `use_upper` | bool | | +| `nO` | int | | diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index e6036d5be..03cd9ba3f 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -297,60 +297,41 @@ will not be available. ## Train {#train} - - Train a model. Expects data in spaCy's -[JSON format](/api/data-formats#json-input). On each epoch, a model will be -saved out to the directory. Accuracy scores and model details will be added to a -[`meta.json`](/usage/training#models-generating) to allow packaging the model -using the [`package`](/api/cli#package) command. +[binary format](/api/data-formats#training) and a +[config file](/api/data-formats#config) with all settings and hyperparameters. +Will save out the best model from all epochs, as well as the final model. The +`--code` argument can be used to provide a Python file that's imported before +the training process starts. This lets you register +[custom functions](/usage/training#custom-models) and architectures and refer to +them in your config, all while still using spaCy's built-in `train` workflow. If +you need to manage complex multi-step training workflows, check out the new +[spaCy projects](/usage/projects). + + + +As of spaCy v3.0, the `train` command doesn't take a long list of command-line +arguments anymore and instead expects a single +[`config.cfg` file](/usage/training#config) containing all settings for the +pipeline, training process and hyperparameters. 
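+
+For example, a file passed via `--code` might register a custom architecture
+for use in the config. The following is a minimal sketch, not an official
+example: it assumes the `registry` object exposed by `spacy.util`, and the
+names `functions.py`, `my_tok2vec.v1` and `build_my_tok2vec` are placeholders.
+
+```python
+### functions.py (hypothetical)
+from thinc.api import Model
+from spacy.util import registry
+
+@registry.architectures("my_tok2vec.v1")
+def build_my_tok2vec(width: int) -> Model:
+    # Build and return any Thinc Model here. A config can then reference it
+    # via @architectures = "my_tok2vec.v1" and pass width as an argument.
+    ...
+```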
+ + ```bash -$ python -m spacy train [lang] [output_path] [train_path] [dev_path] -[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping] -[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec] -[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level] -[--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel] -[--textcat-positive-label] [--verbose] +$ python -m spacy train [train_path] [dev_path] [config_path] [--output] +[--code] [--verbose] ``` -| Argument | Type | Description | -| --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. | -| `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--replace-components`, `-R` | flag | Replace components from the base model. | -| `--vectors`, `-v` | option | Model to load vectors from. | -| `--n-iter`, `-n` | option | Number of iterations (default: `30`). | -| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | -| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). | -| `--use-gpu`, `-g` | option | GPU ID or `-1` for CPU only (default: `-1`). | -| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. | -| `--meta-path`, `-m` 2 | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. | -| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | -| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | -| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | -| `--width`, `-cw` 2.2.4 | option | Width of CNN layers of `Tok2Vec` component. | -| `--conv-depth`, `-cd` 2.2.4 | option | Depth of CNN layers of `Tok2Vec` component. | -| `--cnn-window`, `-cW` 2.2.4 | option | Window size for CNN layers of `Tok2Vec` component. | -| `--cnn-pieces`, `-cP` 2.2.4 | option | Maxout size for CNN layers of `Tok2Vec` component. | -| `--use-chars`, `-chr` 2.2.4 | flag | Whether to use character-based embedding of `Tok2Vec` component. | -| `--bilstm-depth`, `-lstm` 2.2.4 | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). | -| `--embed-rows`, `-er` 2.2.4 | option | Number of embedding rows of `Tok2Vec` component. | -| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | -| `--orth-variant-level`, `-ovl` 2.2 | option | Float indicating the orthography variation for data augmentation (e.g. 
`0.3` for making 30% of occurrences of some tokens subject to replacement). | -| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | -| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging ] subtokens. Typically used for languages like Chinese. | -| `--textcat-multilabel`, `-TML` 2.2 | flag | Text classification classes aren't mutually exclusive (multilabel). | -| `--textcat-arch`, `-ta` 2.2 | option | Text classification model architecture. Defaults to `"bow"`. | -| `--textcat-positive-label`, `-tpl` 2.2 | option | Text classification positive label for binary classes with two labels. | -| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | -| `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | model, pickle | A spaCy model on each epoch. | +| Argument | Type | Description | +| ----------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. | +| `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. | +| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | +| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. | +| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | +| `--verbose`, `-V` | flag | Show more detailed messages during training. | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | model | The final model and the best model. | ## Pretrain {#pretrain new="2.1" tag="experimental"} @@ -471,20 +452,20 @@ as separate files if the respective component is present in the model's pipeline. ```bash -$ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] -[--gpu-id] [--gold-preproc] [--return-scores] +$ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path] +[--displacy-limit] [--gpu-id] [--gold-preproc] ``` -| Argument | Type | Description | -| ------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. | -| `data_path` | positional | Location of JSON-formatted evaluation data. | -| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. | -| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. | -| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. | -| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | -| `--return-scores`, `-R` | flag | Return dict containing model scores. 
| -| **CREATES** | `stdout`, HTML | Training results and optional displaCy visualizations. | +| Argument | Type | Description | +| ------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. | +| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). | +| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. | +| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. | +| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. | +| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. | +| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | +| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. | ## Package {#package} @@ -504,15 +485,17 @@ so you don't have to run `python setup.py sdist` separately anymore. ```bash -$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] +$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] +[--version] [--force] ``` -```bash -### Example -python -m spacy package /input /output -cd /output/en_model-0.0.0 -pip install dist/en_model-0.0.0.tar.gz -``` +> #### Example +> +> ```bash +> python -m spacy package /input /output +> cd /output/en_model-0.0.0 +> pip install dist/en_model-0.0.0.tar.gz +> ``` | Argument | Type | Description | | ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -525,18 +508,137 @@ pip install dist/en_model-0.0.0.tar.gz | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | directory | A Python package containing the spaCy model. | -## Project {#project} +## Project {#project new="3"} - +The `spacy project` CLI includes subcommands for working with +[spaCy projects](/usage/projects), end-to-end workflows for building and +deploying custom spaCy models. ### project clone {#project-clone} +Clone a project template from a Git repository. Calls into `git` under the hood +and uses the sparse checkout feature, so you're only downloading what you need. +By default, spaCy's +[project templates repo](https://github.com/explosion/projects) is used, but you +can provide any other repo (public or private) that you have access to using the +`--repo` option. + + + +```bash +$ python -m spacy project clone [name] [dest] [--repo] +``` + +> #### Example +> +> ```bash +> $ python -m spacy project clone some_example +> ``` +> +> Clone from custom repo: +> +> ```bash +> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo +> ``` + +| Argument | Type | Description | +| -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- | +| `name` | positional | The name of the template to clone, relative to the repo. 
Can be a top-level directory or a subdirectory like `dir/template`. |
+| `dest` | positional | Where to clone the project. Defaults to current working directory. |
+| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). |
+
 ### project assets {#project-assets}
 
-### project run-all {#project-run-all}
+Fetch project assets like datasets and pretrained weights. Assets are defined in
+the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a
+`checksum` is provided, the file is only downloaded if no local file with the
+same checksum exists and spaCy will show an error if the checksum of the
+downloaded file doesn't match. If assets don't specify a `url` they're
+considered "private" and you have to take care of putting them into the
+destination directory yourself. If a local path is provided, the asset is copied
+into the current project.
+
+```bash
+$ python -m spacy project assets [project_dir]
+```
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy project assets
+> ```
+
+| Argument | Type | Description |
+| -------------- | ---------- | ------------------------------------------------------------------ |
+| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. |
 
 ### project run {#project-run}
 
-### project init {#project-init}
+Run a named command or workflow defined in the
+[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
+all commands in the workflow are run, in order. If commands define
+[dependencies or outputs](/usage/projects#deps-outputs), they will only be
+re-run if state has changed. For example, if the input dataset changes, a
+preprocessing command that depends on those files will be re-run.
 
-### project update-dvc {#project-update-dvc}
+```bash
+$ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
+```
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy project run train
+> ```
+
+| Argument | Type | Description |
+| --------------- | ---------- | ------------------------------------------------------------------ |
+| `subcommand` | positional | Name of the command or workflow to run. |
+| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
+| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
+| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+
+### project dvc {#project-dvc}
+
+Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
+[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
+the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline,
+so you need to specify one workflow defined in the
+[`project.yml`](/usage/projects#project-yml). If no workflow is specified, the
+first defined workflow is used. The DVC config will only be updated if the
+`project.yml` changed. For details, see the
+[DVC integration](/usage/projects#dvc) docs.
+
+
+
+This command requires DVC to be installed and initialized in the project
+directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init).
+You'll also need to add the assets you want to track with
+[`dvc add`](https://dvc.org/doc/command-reference/add).
+
+
+
+```bash
+$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
+```
+
+> #### Example
+>
+> ```bash
+> git init
+> dvc init
+> python -m spacy project dvc all
+> ```
+
+| Argument | Type | Description |
+| ----------------- | ---------- | ---------------------------------------------------------------------------------- |
+| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
+| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
+| `--force`, `-F` | flag | Force-updating config file. |
+| `--verbose`, `-V` | flag | Print more output generated by DVC. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
diff --git a/website/docs/api/cython.md b/website/docs/api/cython.md
index f91909747..d7c03cf41 100644
--- a/website/docs/api/cython.md
+++ b/website/docs/api/cython.md
@@ -122,7 +122,7 @@ where the rescuers keep passing out from low oxygen, causing another rescuer to
 follow — only to succumb themselves. In short, just say no to optimizing your
 Python. If it's not fast enough the first time, just switch to Cython.
 
-
+
 
 - [Official Cython documentation](http://docs.cython.org/en/latest/)
   (cython.org)
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 5b122a2e2..10fef6ba6 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -2,7 +2,8 @@ title: Data formats
 teaser: Details on spaCy's input and output data formats
 menu:
-  - ['Training data', 'training']
+  - ['Training Data', 'training']
+  - ['Training Config', 'config']
   - ['Vocabulary', 'vocab']
 ---
 
@@ -74,6 +75,29 @@ from the English Wall Street Journal portion of the Penn Treebank:
 https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
 ```
 
+## Training config {#config new="3"}
+
+Config files define the training process and model pipeline and can be passed to
+[`spacy train`](/api/cli#train). They use
+[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
+hood. For details on how to use training configs, see the
+[usage documentation](/usage/training#config).
+
+
+
+The `@` syntax lets you refer to function names registered in the
+[function registry](/api/top-level#registry). For example,
+`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
+the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block
+will be passed into that function as arguments. Those arguments depend on the
+registered function. See the [model architectures](/api/architectures) docs for
+API details.
+
+
+
+
+
+
 ## Lexical data for vocabulary {#vocab-jsonl new="2"}
 
 To populate a model's vocabulary, you can use the
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index 0980dc2e0..135caf0c2 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
 component is available in the [processing pipeline](/usage/processing-pipelines)
 via the ID `"parser"`.
 
-## DependencyParser.Model {#model tag="classmethod"}
+## Default config {#config}
 
-Initialize a model for the pipe. The model should implement the
-`thinc.neural.Model` API.
Wrappers are under development for most major machine -learning libraries. +This is the default configuration used to initialize the model powering the +pipeline component. See the [model architectures](/api/architectures) +documentation for details on the architectures and their arguments and +hyperparameters. To learn more about how to customize the config and train +custom models, check out the [training config](/usage/training#config) docs. -| Name | Type | Description | -| ----------- | ------ | ------------------------------------- | -| `**kwargs` | - | Parameters for initializing the model | -| **RETURNS** | object | The initialized model. | +```python +https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_defaults.cfg +``` ## DependencyParser.\_\_init\_\_ {#init tag="method"} +> #### Example +> +> ```python +> # Construction via create_pipe with default model +> parser = nlp.create_pipe("parser") +> +> # Construction via create_pipe with custom model +> config = {"model": {"@architectures": "my_parser"}} +> parser = nlp.create_pipe("parser", config) +> +> # Construction from class with custom model from file +> from spacy.pipeline import DependencyParser +> model = util.load_config("model.cfg", create_objects=True)["model"] +> parser = DependencyParser(nlp.vocab, model) +> ``` + Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.create_pipe`](/api/language#create_pipe). -> #### Example -> -> ```python -> # Construction via create_pipe -> parser = nlp.create_pipe("parser") -> -> # Construction from class -> from spacy.pipeline import DependencyParser -> parser = DependencyParser(nlp.vocab) -> parser.from_disk("/path/to/model") -> ``` - -| Name | Type | Description | -| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | -| `**cfg` | - | Configuration parameters. | -| **RETURNS** | `DependencyParser` | The newly constructed object. | +| Name | Type | Description | +| ----------- | ------------------ | ------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `**cfg` | - | Configuration parameters. | +| **RETURNS** | `DependencyParser` | The newly constructed object. | ## DependencyParser.\_\_call\_\_ {#call tag="method"} @@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and > pass > ``` -| Name | Type | Description | -| ------------ | -------- | ------------------------------------------------------ | -| `stream` | iterable | A stream of documents. | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Type | Description | +| ------------ | --------------- | ------------------------------------------------------ | +| `stream` | `Iterable[Doc]` | A stream of documents. | +| `batch_size` | int | The number of texts to buffer. 
Defaults to `128`. | +| **YIELDS** | `Doc` | Processed documents in the order of the original text. | ## DependencyParser.predict {#predict tag="method"} @@ -104,7 +109,7 @@ Apply the pipeline's model to a batch of docs, without modifying them. | Name | Type | Description | | ----------- | ------------------- | ---------------------------------------------- | -| `docs` | iterable | The documents to predict. | +| `docs` | `Iterable[Doc]` | The documents to predict. | | **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). | ## DependencyParser.set_annotations {#set_annotations tag="method"} @@ -119,33 +124,34 @@ Modify a batch of documents, using pre-computed scores. > parser.set_annotations([doc1, doc2], scores) > ``` -| Name | Type | Description | -| -------- | -------- | ---------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `DependencyParser.predict`. | +| Name | Type | Description | +| -------- | ------------------- | ---------------------------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to modify. | +| `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. | ## DependencyParser.update {#update tag="method"} -Learn from a batch of documents and gold-standard information, updating the -pipe's model. Delegates to [`predict`](/api/dependencyparser#predict) and +Learn from a batch of [`Example`](/api/example) objects, updating the pipe's +model. Delegates to [`predict`](/api/dependencyparser#predict) and [`get_loss`](/api/dependencyparser#get_loss). > #### Example > > ```python -> parser = DependencyParser(nlp.vocab) -> losses = {} +> parser = DependencyParser(nlp.vocab, parser_model) > optimizer = nlp.begin_training() -> parser.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) +> losses = parser.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------- | -------- | -------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of documents to learn from. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `drop` | float | The dropout rate. | -| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. | -| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | +| Name | Type | Description | +| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). | +| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## DependencyParser.get_loss {#get_loss tag="method"} @@ -156,21 +162,20 @@ predicted scores. 
> > ```python > parser = DependencyParser(nlp.vocab) -> scores = parser.predict([doc1, doc2]) -> loss, d_loss = parser.get_loss([doc1, doc2], [gold1, gold2], scores) +> scores = parser.predict([eg.predicted for eg in examples]) +> loss, d_loss = parser.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------ | -| `docs` | iterable | The batch of documents. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `scores` | - | Scores representing the model's predictions. | -| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Type | Description | +| ----------- | ------------------- | --------------------------------------------------- | +| `examples` | `Iterable[Example]` | The batch of examples. | +| `scores` | `syntax.StateClass` | Scores representing the model's predictions. | +| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | ## DependencyParser.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. If no model -has been initialized yet, the model is added. +Initialize the pipe for training, using data examples if available. Return an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example > @@ -180,16 +185,17 @@ has been initialized yet, the model is added. > optimizer = parser.begin_training(pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | -| `pipeline` | list | Optional list of pipeline components that this component is part of. | -| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`DependencyParser`](/api/dependencyparser#create_optimizer) if not set. | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. | +| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. | +| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. | +| **RETURNS** | `Optimizer` | An optimizer. | ## DependencyParser.create_optimizer {#create_optimizer tag="method"} -Create an optimizer for the pipeline component. +Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline +component. > #### Example > @@ -198,9 +204,9 @@ Create an optimizer for the pipeline component. > optimizer = parser.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | -------- | -------------- | -| **RETURNS** | callable | The optimizer. 
| +| Name | Type | Description | +| ----------- | ----------- | --------------------------------------------------------------- | +| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | ## DependencyParser.use_params {#use_params tag="method, contextmanager"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index d7f25ed56..b77fc059d 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -12,44 +12,47 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline component is available in the [processing pipeline](/usage/processing-pipelines) via the ID `"entity_linker"`. -## EntityLinker.Model {#model tag="classmethod"} +## Default config {#config} -Initialize a model for the pipe. The model should implement the -`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the -context encoder. Wrappers are under development for most major machine learning -libraries. +This is the default configuration used to initialize the model powering the +pipeline component. See the [model architectures](/api/architectures) +documentation for details on the architectures and their arguments and +hyperparameters. To learn more about how to customize the config and train +custom models, check out the [training config](/usage/training#config) docs. -| Name | Type | Description | -| ----------- | ------ | ------------------------------------- | -| `**kwargs` | - | Parameters for initializing the model | -| **RETURNS** | object | The initialized model. | +```python +https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_linker_defaults.cfg +``` ## EntityLinker.\_\_init\_\_ {#init tag="method"} +> #### Example +> +> ```python +> # Construction via create_pipe with default model +> entity_linker = nlp.create_pipe("entity_linker") +> +> # Construction via create_pipe with custom model +> config = {"model": {"@architectures": "my_el"}} +> entity_linker = nlp.create_pipe("entity_linker", config) +> +> # Construction from class with custom model from file +> from spacy.pipeline import EntityLinker +> model = util.load_config("model.cfg", create_objects=True)["model"] +> entity_linker = EntityLinker(nlp.vocab, model) +> ``` + Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.create_pipe`](/api/language#create_pipe). -> #### Example -> -> ```python -> # Construction via create_pipe -> entity_linker = nlp.create_pipe("entity_linker") -> -> # Construction from class -> from spacy.pipeline import EntityLinker -> entity_linker = EntityLinker(nlp.vocab) -> entity_linker.from_disk("/path/to/model") -> ``` +| Name | Type | Description | +| ------- | ------- | ------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `**cfg` | - | Configuration parameters. | -| Name | Type | Description | -| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. 
If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | -| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to `128`. | -| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to `True`. | -| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilities are used). Defaults to `True`. | -| **RETURNS** | `EntityLinker` | The newly constructed object. | +| **RETURNS** | `EntityLinker` | The newly constructed object. | ## EntityLinker.\_\_call\_\_ {#call tag="method"} @@ -91,11 +94,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and > pass > ``` -| Name | Type | Description | -| ------------ | -------- | ------------------------------------------------------ | -| `stream` | iterable | A stream of documents. | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Type | Description | +| ------------ | --------------- | ------------------------------------------------------ | +| `stream` | `Iterable[Doc]` | A stream of documents. | +| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | +| **YIELDS** | `Doc` | Processed documents in the order of the original text. | ## EntityLinker.predict {#predict tag="method"} @@ -105,13 +108,13 @@ Apply the pipeline's model to a batch of docs, without modifying them. > > ```python > entity_linker = EntityLinker(nlp.vocab) -> kb_ids, tensors = entity_linker.predict([doc1, doc2]) +> kb_ids = entity_linker.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to predict. | -| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. | +| Name | Type | Description | +| ----------- | --------------- | ------------------------------------------------------------ | +| `docs` | `Iterable[Doc]` | The documents to predict. | +| **RETURNS** | `Iterable[str]` | The predicted KB identifiers for the entities in the `docs`. | ## EntityLinker.set_annotations {#set_annotations tag="method"} @@ -122,19 +125,18 @@ entities. > > ```python > entity_linker = EntityLinker(nlp.vocab) -> kb_ids, tensors = entity_linker.predict([doc1, doc2]) -> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors) +> kb_ids = entity_linker.predict([doc1, doc2]) +> entity_linker.set_annotations([doc1, doc2], kb_ids) > ``` -| Name | Type | Description | -| --------- | -------- | ------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | -| `tensors` | iterable | The token representations used to predict the identifiers. 
| +| Name | Type | Description | +| -------- | --------------- | ------------------------------------------------------------------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to modify. | +| `kb_ids` | `Iterable[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | ## EntityLinker.update {#update tag="method"} -Learn from a batch of documents and gold-standard information, updating both the +Learn from a batch of [`Example`](/api/example) objects, updating both the pipe's entity linking model and context encoder. Delegates to [`predict`](/api/entitylinker#predict) and [`get_loss`](/api/entitylinker#get_loss). @@ -142,40 +144,20 @@ pipe's entity linking model and context encoder. Delegates to > #### Example > > ```python -> entity_linker = EntityLinker(nlp.vocab) -> losses = {} +> entity_linker = EntityLinker(nlp.vocab, nel_model) > optimizer = nlp.begin_training() -> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) +> losses = entity_linker.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------- | -------- | ------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of documents to learn from. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `drop` | float | The dropout rate, used both for the EL model and the context encoder. | -| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. | -| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | - -## EntityLinker.get_loss {#get_loss tag="method"} - -Find the loss and gradient of loss for the entities in a batch of documents and -their predicted scores. - -> #### Example -> -> ```python -> entity_linker = EntityLinker(nlp.vocab) -> kb_ids, tensors = entity_linker.predict(docs) -> loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors) -> ``` - -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------ | -| `docs` | iterable | The batch of documents. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `kb_ids` | iterable | KB identifiers representing the model's predictions. | -| `tensors` | iterable | The token representations used to predict the identifiers | -| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Type | Description | +| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations). | +| `sgd` | `Optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. 
| ## EntityLinker.set_kb {#set_kb tag="method"} @@ -195,9 +177,9 @@ identifiers. ## EntityLinker.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. If no model -has been initialized yet, the model is added. Before calling this method, a -knowledge base should have been defined with +Initialize the pipe for training, using data examples if available. Return an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this +method, a knowledge base should have been defined with [`set_kb`](/api/entitylinker#set_kb). > #### Example @@ -209,12 +191,12 @@ knowledge base should have been defined with > optimizer = entity_linker.begin_training(pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | -| `pipeline` | list | Optional list of pipeline components that this component is part of. | -| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. | +| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. | +| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. | +| **RETURNS** | `Optimizer` | An optimizer. | | ## EntityLinker.create_optimizer {#create_optimizer tag="method"} @@ -227,9 +209,9 @@ Create an optimizer for the pipeline component. > optimizer = entity_linker.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | -------- | -------------- | -| **RETURNS** | callable | The optimizer. | +| Name | Type | Description | +| ----------- | ----------- | --------------------------------------------------------------- | +| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | ## EntityLinker.use_params {#use_params tag="method, contextmanager"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index c9a81f6f1..23cc71558 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline component is available in the [processing pipeline](/usage/processing-pipelines) via the ID `"ner"`. -## EntityRecognizer.Model {#model tag="classmethod"} +## Default config {#config} -Initialize a model for the pipe. The model should implement the -`thinc.neural.Model` API. Wrappers are under development for most major machine -learning libraries. +This is the default configuration used to initialize the model powering the +pipeline component. 
See the [model architectures](/api/architectures) +documentation for details on the architectures and their arguments and +hyperparameters. To learn more about how to customize the config and train +custom models, check out the [training config](/usage/training#config) docs. -| Name | Type | Description | -| ----------- | ------ | ------------------------------------- | -| `**kwargs` | - | Parameters for initializing the model | -| **RETURNS** | object | The initialized model. | +```python +https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defaults.cfg +``` ## EntityRecognizer.\_\_init\_\_ {#init tag="method"} -Create a new pipeline instance. In your application, you would normally use a -shortcut for this and instantiate the component using its string name and -[`nlp.create_pipe`](/api/language#create_pipe). - > #### Example > > ```python > # Construction via create_pipe > ner = nlp.create_pipe("ner") > -> # Construction from class +> # Construction via create_pipe with custom model +> config = {"model": {"@architectures": "my_ner"}} +> parser = nlp.create_pipe("ner", config) +> +> # Construction from class with custom model from file > from spacy.pipeline import EntityRecognizer -> ner = EntityRecognizer(nlp.vocab) -> ner.from_disk("/path/to/model") +> model = util.load_config("model.cfg", create_objects=True)["model"] +> ner = EntityRecognizer(nlp.vocab, model) > ``` -| Name | Type | Description | -| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | -| `**cfg` | - | Configuration parameters. | -| **RETURNS** | `EntityRecognizer` | The newly constructed object. | +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.create_pipe`](/api/language#create_pipe). + +| Name | Type | Description | +| ----------- | ------------------ | ------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `**cfg` | - | Configuration parameters. | +| **RETURNS** | `EntityRecognizer` | The newly constructed object. | ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} @@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and > pass > ``` -| Name | Type | Description | -| ------------ | -------- | ------------------------------------------------------ | -| `stream` | iterable | A stream of documents. | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Type | Description | +| ------------ | --------------- | ------------------------------------------------------ | +| `stream` | `Iterable[Doc]` | A stream of documents. | +| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | +| **YIELDS** | `Doc` | Processed documents in the order of the original text. 
| ## EntityRecognizer.predict {#predict tag="method"} @@ -99,13 +104,13 @@ Apply the pipeline's model to a batch of docs, without modifying them. > > ```python > ner = EntityRecognizer(nlp.vocab) -> scores, tensors = ner.predict([doc1, doc2]) +> scores = ner.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to predict. | -| **RETURNS** | list | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). | +| Name | Type | Description | +| ----------- | ------------------ | ---------------------------------------------------------------------------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to predict. | +| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). | ## EntityRecognizer.set_annotations {#set_annotations tag="method"} @@ -115,38 +120,38 @@ Modify a batch of documents, using pre-computed scores. > > ```python > ner = EntityRecognizer(nlp.vocab) -> scores, tensors = ner.predict([doc1, doc2]) -> ner.set_annotations([doc1, doc2], scores, tensors) +> scores = ner.predict([doc1, doc2]) +> ner.set_annotations([doc1, doc2], scores) > ``` -| Name | Type | Description | -| --------- | -------- | ---------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. | -| `tensors` | iterable | The token representations used to predict the scores. | +| Name | Type | Description | +| -------- | ------------------ | ---------------------------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to modify. | +| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`. | ## EntityRecognizer.update {#update tag="method"} -Learn from a batch of documents and gold-standard information, updating the -pipe's model. Delegates to [`predict`](/api/entityrecognizer#predict) and +Learn from a batch of [`Example`](/api/example) objects, updating the pipe's +model. Delegates to [`predict`](/api/entityrecognizer#predict) and [`get_loss`](/api/entityrecognizer#get_loss). > #### Example > > ```python -> ner = EntityRecognizer(nlp.vocab) -> losses = {} +> ner = EntityRecognizer(nlp.vocab, ner_model) > optimizer = nlp.begin_training() -> ner.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) +> losses = ner.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------- | -------- | -------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of documents to learn from. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `drop` | float | The dropout rate. | -| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. | -| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. 
| +| Name | Type | Description | +| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). | +| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## EntityRecognizer.get_loss {#get_loss tag="method"} @@ -157,21 +162,20 @@ predicted scores. > > ```python > ner = EntityRecognizer(nlp.vocab) -> scores = ner.predict([doc1, doc2]) -> loss, d_loss = ner.get_loss([doc1, doc2], [gold1, gold2], scores) +> scores = ner.predict([eg.predicted for eg in examples]) +> loss, d_loss = ner.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------ | -| `docs` | iterable | The batch of documents. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `scores` | - | Scores representing the model's predictions. | -| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Type | Description | +| ----------- | ------------------- | --------------------------------------------------- | +| `examples` | `Iterable[Example]` | The batch of examples. | +| `scores` | `List[StateClass]` | Scores representing the model's predictions. | +| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | ## EntityRecognizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. If no model -has been initialized yet, the model is added. +Initialize the pipe for training, using data examples if available. Return an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example > @@ -181,12 +185,12 @@ has been initialized yet, the model is added. > optimizer = ner.begin_training(pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | -| `pipeline` | list | Optional list of pipeline components that this component is part of. | -| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityRecognizer`](/api/entityrecognizer#create_optimizer) if not set. | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. 
| +| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. | +| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. | +| **RETURNS** | `Optimizer` | An optimizer. | ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} @@ -199,9 +203,9 @@ Create an optimizer for the pipeline component. > optimizer = ner.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | -------- | -------------- | -| **RETURNS** | callable | The optimizer. | +| Name | Type | Description | +| ----------- | ----------- | --------------------------------------------------------------- | +| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | ## EntityRecognizer.use_params {#use_params tag="method, contextmanager"} diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 9dabaf851..421828f95 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -1,10 +1,280 @@ --- title: Example -teaser: A training example +teaser: A training instance tag: class source: spacy/gold/example.pyx +new: 3.0 --- - +An `Example` holds the information for one training instance. It stores two +`Doc` objects: one for holding the gold-standard reference data, and one for +holding the predictions of the pipeline. An `Alignment` +object stores the alignment between these two documents, as they can differ in +tokenization. ## Example.\_\_init\_\_ {#init tag="method"} + +Construct an `Example` object from the `predicted` document and the `reference` +document. If `alignment` is `None`, it will be initialized from the words in +both documents. + +> #### Example +> +> ```python +> from spacy.tokens import Doc +> from spacy.gold import Example +> +> words = ["hello", "world", "!"] +> spaces = [True, False, False] +> predicted = Doc(nlp.vocab, words=words, spaces=spaces) +> reference = parse_gold_doc(my_data) +> example = Example(predicted, reference) +> ``` + +| Name | Type | Description | +| -------------- | ----------- | ------------------------------------------------------------------------------------------------ | +| `predicted` | `Doc` | The document containing (partial) predictions. Can not be `None`. | +| `reference` | `Doc` | The document containing gold-standard annotations. Can not be `None`. | +| _keyword-only_ | | | +| `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. | +| **RETURNS** | `Example` | The newly constructed object. | + +## Example.from_dict {#from_dict tag="classmethod"} + +Construct an `Example` object from the `predicted` document and the reference +annotations provided as a dictionary. + + + +> #### Example +> +> ```python +> from spacy.tokens import Doc +> from spacy.gold import Example +> +> predicted = Doc(vocab, words=["Apply", "some", "sunscreen"]) +> token_ref = ["Apply", "some", "sun", "screen"] +> tags_ref = ["VERB", "DET", "NOUN", "NOUN"] +> example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref}) +> ``` + +| Name | Type | Description | +| -------------- | ---------------- | ----------------------------------------------------------------- | +| `predicted` | `Doc` | The document containing (partial) predictions. Can not be `None`. | +| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. 
Can not be `None`. |
+| **RETURNS**    | `Example`        | The newly constructed object.                                      |
+
+## Example.text {#text tag="property"}
+
+The text of the `predicted` document in this `Example`.
+
+> #### Example
+>
+> ```python
+> raw_text = example.text
+> ```
+
+| Name        | Type | Description                           |
+| ----------- | ---- | ------------------------------------- |
+| **RETURNS** | str  | The text of the `predicted` document. |
+
+## Example.predicted {#predicted tag="property"}
+
+> #### Example
+>
+> ```python
+> docs = [eg.predicted for eg in examples]
+> predictions, _ = model.begin_update(docs)
+> set_annotations(docs, predictions)
+> ```
+
+The `Doc` holding the predictions. Occasionally also referred to as `example.x`.
+
+| Name        | Type  | Description                                    |
+| ----------- | ----- | ---------------------------------------------- |
+| **RETURNS** | `Doc` | The document containing (partial) predictions. |
+
+## Example.reference {#reference tag="property"}
+
+> #### Example
+>
+> ```python
+> for i, eg in enumerate(examples):
+>     for j, label in enumerate(all_labels):
+>         gold_labels[i][j] = eg.reference.cats.get(label, 0.0)
+> ```
+
+The `Doc` holding the gold-standard annotations. Occasionally also referred to
+as `example.y`.
+
+| Name        | Type  | Description                                         |
+| ----------- | ----- | --------------------------------------------------- |
+| **RETURNS** | `Doc` | The document containing gold-standard annotations.  |
+
+## Example.alignment {#alignment tag="property"}
+
+> #### Example
+>
+> ```python
+> tokens_x = ["Apply", "some", "sunscreen"]
+> x = Doc(vocab, words=tokens_x)
+> tokens_y = ["Apply", "some", "sun", "screen"]
+> example = Example.from_dict(x, {"words": tokens_y})
+> alignment = example.alignment
+> assert list(alignment.y2x.data) == [[0], [1], [2], [2]]
+> ```
+
+The `Alignment` object mapping the tokens of the `predicted` document to those
+of the `reference` document.
+
+| Name        | Type        | Description                                                                       |
+| ----------- | ----------- | --------------------------------------------------------------------------------- |
+| **RETURNS** | `Alignment` | The alignment between the tokenization of the `predicted` and `reference` documents. |
+
+## Example.get_aligned {#get_aligned tag="method"}
+
+> #### Example
+>
+> ```python
+> predicted = Doc(vocab, words=["Apply", "some", "sunscreen"])
+> token_ref = ["Apply", "some", "sun", "screen"]
+> tags_ref = ["VERB", "DET", "NOUN", "NOUN"]
+> example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
+> assert example.get_aligned("TAG", as_string=True) == ["VERB", "DET", "NOUN"]
+> ```
+
+Get the aligned view of a certain token attribute, denoted by its int ID or
+string name.
+
+| Name        | Type                       | Description                                                         | Default |
+| ----------- | -------------------------- | -------------------------------------------------------------------- | ------- |
+| `field`     | int or str                 | Attribute ID or string name.                                        |         |
+| `as_string` | bool                       | Whether or not to return the list of values as strings.             | `False` |
+| **RETURNS** | `List[int]` or `List[str]` | List of integer values, or string values if `as_string` is `True`.  |         |
+
+## Example.get_aligned_parse {#get_aligned_parse tag="method"}
+
+> #### Example
+>
+> ```python
+> doc = nlp("He pretty quickly walks away")
+> example = Example.from_dict(doc, {"heads": [3, 2, 3, 0, 2]})
+> proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
+> assert proj_heads == [3, 2, 3, 0, 3]
+> ```
+
+Get the aligned view of the dependency parse.
If `projectivize` is set to
+`True`, non-projective dependency trees are made projective through the
+Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005).
+
+| Name           | Type  | Description                                                                      | Default |
+| -------------- | ----- | --------------------------------------------------------------------------------- | ------- |
+| `projectivize` | bool  | Whether or not to projectivize the dependency trees.                             | `True`  |
+| **RETURNS**    | tuple | The aligned heads (`List[int]`) and dependency labels (`List[str]`), i.e. `(heads, labels)`. |         |
+
+## Example.get_aligned_ner {#get_aligned_ner tag="method"}
+
+> #### Example
+>
+> ```python
+> words = ["Mrs", "Smith", "flew", "to", "New York"]
+> doc = Doc(en_vocab, words=words)
+> entities = [(0, 9, "PERSON"), (18, 26, "LOC")]
+> gold_words = ["Mrs Smith", "flew", "to", "New", "York"]
+> example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+> ner_tags = example.get_aligned_ner()
+> assert ner_tags == ["B-PERSON", "L-PERSON", "O", "O", "U-LOC"]
+> ```
+
+Get the aligned view of the NER
+[BILUO](/usage/linguistic-features#accessing-ner) tags.
+
+| Name        | Type        | Description                                                                          |
+| ----------- | ----------- | ------------------------------------------------------------------------------------- |
+| **RETURNS** | `List[str]` | List of BILUO values, denoting whether tokens are part of an NER annotation or not.  |
+
+## Example.get_aligned_spans_y2x {#get_aligned_spans_y2x tag="method"}
+
+> #### Example
+>
+> ```python
+> words = ["Mr and Mrs Smith", "flew", "to", "New York"]
+> doc = Doc(en_vocab, words=words)
+> entities = [(0, 16, "PERSON")]
+> tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "New", "York"]
+> example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
+> ents_ref = example.reference.ents
+> assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4)]
+> ents_y2x = example.get_aligned_spans_y2x(ents_ref)
+> assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1)]
+> ```
+
+Get the aligned view of any set of [`Span`](/api/span) objects defined over
+`example.reference`. The resulting span indices will align to the tokenization
+in `example.predicted`.
+
+| Name        | Type             | Description                                                      |
+| ----------- | ---------------- | ----------------------------------------------------------------- |
+| `y_spans`   | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`.  |
+| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`.  |
+
+## Example.get_aligned_spans_x2y {#get_aligned_spans_x2y tag="method"}
+
+> #### Example
+>
+> ```python
+> nlp.add_pipe(my_ner)
+> doc = nlp("Mr and Mrs Smith flew to New York")
+> tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "New York"]
+> example = Example.from_dict(doc, {"words": tokens_ref})
+> ents_pred = example.predicted.ents
+> # Assume the NER model has found "Mr and Mrs Smith" as a named entity
+> assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4)]
+> ents_x2y = example.get_aligned_spans_x2y(ents_pred)
+> assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2)]
+> ```
+
+Get the aligned view of any set of [`Span`](/api/span) objects defined over
+`example.predicted`. The resulting span indices will align to the tokenization
+in `example.reference`. This method is particularly useful to assess the
+accuracy of predicted entities against the original gold-standard annotation.
+ +| Name | Type | Description | +| ----------- | ---------------- | --------------------------------------------------------------- | +| `x_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. | +| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. | + +## Example.to_dict {#to_dict tag="method"} + +Return a dictionary representation of the reference annotation contained in this +`Example`. + +> #### Example +> +> ```python +> eg_dict = example.to_dict() +> ``` + +| Name | Type | Description | +| ----------- | ---------------- | ------------------------------------------------------ | +| **RETURNS** | `Dict[str, obj]` | Dictionary representation of the reference annotation. | + +## Example.split_sents {#split_sents tag="method"} + +> #### Example +> +> ```python +> doc = nlp("I went yesterday had lots of fun") +> tokens_ref = ["I", "went", "yesterday", "had", "lots", "of", "fun"] +> sents_ref = [True, False, False, True, False, False, False] +> example = Example.from_dict(doc, {"words": tokens_ref, "sent_starts": sents_ref}) +> split_examples = example.split_sents() +> assert split_examples[0].text == "I went yesterday " +> assert split_examples[1].text == "had lots of fun" +> ``` + +Split one `Example` into multiple `Example` objects, one for each sentence. + +| Name | Type | Description | +| ----------- | --------------- | ---------------------------------------------------------- | +| **RETURNS** | `List[Example]` | List of `Example` objects, one for each original sentence. | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index e835168b7..3ba93b360 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -52,7 +52,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved. | Name | Type | Description | | ----------- | ----- | --------------------------------------------------------------------------------- | | `text` | str | The text to be processed. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | **RETURNS** | `Doc` | A container for accessing the annotations. | ## Language.pipe {#pipe tag="method"} @@ -68,15 +68,15 @@ more efficient than processing texts one-by-one. > assert doc.is_parsed > ``` -| Name | Type | Description | -| -------------------------------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts` | iterable | A sequence of strings. | -| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | -| `batch_size` | int | The number of texts to buffer. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | -| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | -| **YIELDS** | `Doc` | Documents in the order of the original text. 
| +| Name | Type | Description | +| -------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts` | `Iterable[str]` | A sequence of strings. | +| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | +| `batch_size` | int | The number of texts to buffer. | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | +| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | +| **YIELDS** | `Doc` | Documents in the order of the original text. | ## Language.update {#update tag="method"} @@ -87,18 +87,19 @@ Update the models in the pipeline. > ```python > for raw_text, entity_offsets in train_data: > doc = nlp.make_doc(raw_text) -> gold = GoldParse(doc, entities=entity_offsets) -> nlp.update([doc], [gold], drop=0.5, sgd=optimizer) +> example = Example.from_dict(doc, {"entities": entity_offsets}) +> nlp.update([example], sgd=optimizer) > ``` -| Name | Type | Description | -| -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of `Doc` objects or strings. If strings, a `Doc` object will be created from the text. | -| `golds` | iterable | A batch of `GoldParse` objects or dictionaries. Dictionaries will be used to create [`GoldParse`](/api/goldparse) objects. For the available keys and their usage, see [`GoldParse.__init__`](/api/goldparse#init). | -| `drop` | float | The dropout rate. | -| `sgd` | callable | An optimizer. | -| `losses` | dict | Dictionary to update with the loss, keyed by pipeline component. | -| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | +| Name | Type | Description | +| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `sgd` | `Optimizer` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | +| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | +| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## Language.evaluate {#evaluate tag="method"} @@ -107,35 +108,37 @@ Evaluate a model's pipeline components. 
> #### Example > > ```python -> scorer = nlp.evaluate(docs_golds, verbose=True) +> scorer = nlp.evaluate(examples, verbose=True) > print(scorer.scores) > ``` -| Name | Type | Description | -| -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects, such that the `Doc` objects contain the predictions and the `GoldParse` objects the correct annotations. Alternatively, `(text, annotations)` tuples of raw text and a dict (see [simple training style](/usage/training#training-simple-style)). | -| `verbose` | bool | Print debugging information. | -| `batch_size` | int | The batch size to use. | -| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | -| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | Scorer | The scorer containing the evaluation scores. | +| Name | Type | Description | +| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| `verbose` | bool | Print debugging information. | +| `batch_size` | int | The batch size to use. | +| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | +| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | +| **RETURNS** | Scorer | The scorer containing the evaluation scores. | ## Language.begin_training {#begin_training tag="method"} -Allocate models, pre-process training data and acquire an optimizer. +Allocate models, pre-process training data and acquire an +[`Optimizer`](https://thinc.ai/docs/api-optimizers). > #### Example > > ```python -> optimizer = nlp.begin_training(gold_tuples) +> optimizer = nlp.begin_training(get_examples) > ``` -| Name | Type | Description | -| -------------------------------------------- | -------- | ---------------------------------------------------------------------------- | -| `gold_tuples` | iterable | Gold-standard training data. | -| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | -| `**cfg` | - | Config parameters (sent to all components). | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------ | +| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. | +| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. If not set, a default one will be created. | +| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | +| `**cfg` | - | Config parameters (sent to all components). | +| **RETURNS** | `Optimizer` | An optimizer. 
| ## Language.use_params {#use_params tag="contextmanager, method"} @@ -155,16 +158,6 @@ their original weights after the block. | `params` | dict | A dictionary of parameters keyed by model ID. | | `**cfg` | - | Config parameters. | -## Language.preprocess_gold {#preprocess_gold tag="method"} - -Can be called before training to pre-process gold data. By default, it handles -nonprojectivity and adds missing tags to the tag map. - -| Name | Type | Description | -| ------------ | -------- | ---------------------------------------- | -| `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects. | -| **YIELDS** | tuple | Tuples of `Doc` and `GoldParse` objects. | - ## Language.create_pipe {#create_pipe tag="method" new="2"} Create a pipeline component from a factory. diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md new file mode 100644 index 000000000..8761ee903 --- /dev/null +++ b/website/docs/api/morphologizer.md @@ -0,0 +1,23 @@ +--- +title: Morphologizer +tag: class +source: spacy/pipeline/morphologizer.pyx +new: 3 +--- + +A trainable pipeline component to predict morphological features. This class is +a subclass of `Pipe` and follows the same API. The component is also available +via the string name `"morphologizer"`. After initialization, it is typically +added to the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). + +## Default config {#config} + +This is the default configuration used to initialize the model powering the +pipeline component. See the [model architectures](/api/architectures) +documentation for details on the architectures and their arguments and +hyperparameters. To learn more about how to customize the config and train +custom models, check out the [training config](/usage/training#config) docs. + +```python +https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/morphologizer_defaults.cfg +``` diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 8ad735e0d..cd720d26c 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -27,22 +27,20 @@ Create a new `Scorer`. ## Scorer.score {#score tag="method"} -Update the evaluation scores from a single [`Doc`](/api/doc) / -[`GoldParse`](/api/goldparse) pair. +Update the evaluation scores from a single [`Example`](/api/example) object. > #### Example > > ```python > scorer = Scorer() -> scorer.score(doc, gold) +> scorer.score(example) > ``` -| Name | Type | Description | -| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The predicted annotations. | -| `gold` | `GoldParse` | The correct annotations. | -| `verbose` | bool | Print debugging information. | -| `punct_labels` | tuple | Dependency labels for punctuation. Used to evaluate dependency attachments to punctuation if `eval_punct` is `True`. | +| Name | Type | Description | +| -------------- | --------- | -------------------------------------------------------------------------------------------------------------------- | +| `example` | `Example` | The `Example` object holding both the predictions and the correct gold-standard annotations. | +| `verbose` | bool | Print debugging information. | +| `punct_labels` | tuple | Dependency labels for punctuation. Used to evaluate dependency attachments to punctuation if `eval_punct` is `True`. 
|

## Properties

diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md
index 367b79e5d..458e42975 100644
--- a/website/docs/api/sentencerecognizer.md
+++ b/website/docs/api/sentencerecognizer.md
@@ -11,6 +11,18 @@ subclass of `Pipe` and follows the same API. The component is also available
via the string name `"senter"`. After initialization, it is typically added to
the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).

+## Default config {#config}
+
+This is the default configuration used to initialize the model powering the
+pipeline component. See the [model architectures](/api/architectures)
+documentation for details on the architectures and their arguments and
+hyperparameters. To learn more about how to customize the config and train
+custom models, check out the [training config](/usage/training#config) docs.
+
+```python
+https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/senter_defaults.cfg
+```
+
## SentenceRecognizer.\_\_init\_\_ {#init tag="method"}

Initialize the sentence recognizer.
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index f14da3ac5..9ef0843cf 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -8,41 +8,34 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"tagger"`.

-## Tagger.Model {#model tag="classmethod"}
-
-Initialize a model for the pipe. The model should implement the
-`thinc.neural.Model` API. Wrappers are under development for most major machine
-learning libraries.
-
-| Name        | Type   | Description                            |
-| ----------- | ------ | -------------------------------------- |
-| `**kwargs`  | -      | Parameters for initializing the model  |
-| **RETURNS** | object | The initialized model.                 |
-
## Tagger.\_\_init\_\_ {#init tag="method"}

-Create a new pipeline instance. In your application, you would normally use a
-shortcut for this and instantiate the component using its string name and
-[`nlp.create_pipe`](/api/language#create_pipe).
-
> #### Example
>
> ```python
> # Construction via create_pipe
> tagger = nlp.create_pipe("tagger")
>
-> # Construction from class
+> # Construction via create_pipe with custom model
+> config = {"model": {"@architectures": "my_tagger"}}
+> tagger = nlp.create_pipe("tagger", config)
+>
+> # Construction from class with custom model from file
> from spacy.pipeline import Tagger
-> tagger = Tagger(nlp.vocab)
-> tagger.from_disk("/path/to/model")
+> model = util.load_config("model.cfg", create_objects=True)["model"]
+> tagger = Tagger(nlp.vocab, model)
> ```

-| Name        | Type                          | Description                                                                                                                                            |
-| ----------- | ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`     | `Vocab`                       | The shared vocabulary.                                                                                                                                 |
-| `model`     | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`.  |
-| `**cfg`     | -                             | Configuration parameters.                                                                                                                              |
-| **RETURNS** | `Tagger`                      | The newly constructed object.                                                                                                                          |
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.create_pipe`](/api/language#create_pipe).
+ +| Name | Type | Description | +| ----------- | -------- | ------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `**cfg` | - | Configuration parameters. | +| **RETURNS** | `Tagger` | The newly constructed object. | ## Tagger.\_\_call\_\_ {#call tag="method"} @@ -83,11 +76,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and > pass > ``` -| Name | Type | Description | -| ------------ | -------- | ------------------------------------------------------ | -| `stream` | iterable | A stream of documents. | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Type | Description | +| ------------ | --------------- | ------------------------------------------------------ | +| `stream` | `Iterable[Doc]` | A stream of documents. | +| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | +| **YIELDS** | `Doc` | Processed documents in the order of the original text. | ## Tagger.predict {#predict tag="method"} @@ -97,13 +90,13 @@ Apply the pipeline's model to a batch of docs, without modifying them. > > ```python > tagger = Tagger(nlp.vocab) -> scores, tensors = tagger.predict([doc1, doc2]) +> scores = tagger.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to predict. | -| **RETURNS** | tuple | A `(scores, tensors)` tuple where `scores` is the model's prediction for each document and `tensors` is the token representations used to predict the scores. Each tensor is an array with one row for each token in the document. | +| Name | Type | Description | +| ----------- | --------------- | ----------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to predict. | +| **RETURNS** | - | The model's prediction for each document. | ## Tagger.set_annotations {#set_annotations tag="method"} @@ -113,15 +106,14 @@ Modify a batch of documents, using pre-computed scores. > > ```python > tagger = Tagger(nlp.vocab) -> scores, tensors = tagger.predict([doc1, doc2]) -> tagger.set_annotations([doc1, doc2], scores, tensors) +> scores = tagger.predict([doc1, doc2]) +> tagger.set_annotations([doc1, doc2], scores) > ``` -| Name | Type | Description | -| --------- | -------- | ----------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `Tagger.predict`. | -| `tensors` | iterable | The token representations used to predict the scores. | +| Name | Type | Description | +| -------- | --------------- | ------------------------------------------------ | +| `docs` | `Iterable[Doc]` | The documents to modify. | +| `scores` | - | The scores to set, produced by `Tagger.predict`. | ## Tagger.update {#update tag="method"} @@ -132,19 +124,20 @@ pipe's model. 
Delegates to [`predict`](/api/tagger#predict) and > #### Example > > ```python -> tagger = Tagger(nlp.vocab) -> losses = {} +> tagger = Tagger(nlp.vocab, tagger_model) > optimizer = nlp.begin_training() -> tagger.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) +> losses = tagger.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------- | -------- | -------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of documents to learn from. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `drop` | float | The dropout rate. | -| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. | -| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | +| Name | Type | Description | +| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tagger#set_annotations). | +| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## Tagger.get_loss {#get_loss tag="method"} @@ -155,21 +148,20 @@ predicted scores. > > ```python > tagger = Tagger(nlp.vocab) -> scores = tagger.predict([doc1, doc2]) -> loss, d_loss = tagger.get_loss([doc1, doc2], [gold1, gold2], scores) +> scores = tagger.predict([eg.predicted for eg in examples]) +> loss, d_loss = tagger.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------ | -| `docs` | iterable | The batch of documents. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `scores` | - | Scores representing the model's predictions. | -| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Type | Description | +| ----------- | ------------------- | --------------------------------------------------- | +| `examples` | `Iterable[Example]` | The batch of examples. | +| `scores` | - | Scores representing the model's predictions. | +| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | ## Tagger.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. If no model -has been initialized yet, the model is added. +Initialize the pipe for training, using data examples if available. Return an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example > @@ -179,12 +171,12 @@ has been initialized yet, the model is added. 
> optimizer = tagger.begin_training(pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | -| `pipeline` | list | Optional list of pipeline components that this component is part of. | -| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`Tagger`](/api/tagger#create_optimizer) if not set. | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. | +| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. | +| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/tagger#create_optimizer) if not set. | +| **RETURNS** | `Optimizer` | An optimizer. | ## Tagger.create_optimizer {#create_optimizer tag="method"} @@ -197,9 +189,9 @@ Create an optimizer for the pipeline component. > optimizer = tagger.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | -------- | -------------- | -| **RETURNS** | callable | The optimizer. | +| Name | Type | Description | +| ----------- | ----------- | --------------------------------------------------------------- | +| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | ## Tagger.use_params {#use_params tag="method, contextmanager"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index dc1c083ac..431ee683b 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -9,44 +9,50 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline component is available in the [processing pipeline](/usage/processing-pipelines) via the ID `"textcat"`. -## TextCategorizer.Model {#model tag="classmethod"} +## Default config {#config} -Initialize a model for the pipe. The model should implement the -`thinc.neural.Model` API. Wrappers are under development for most major machine -learning libraries. +This is the default configuration used to initialize the model powering the +pipeline component. See the [model architectures](/api/architectures) +documentation for details on the architectures and their arguments and +hyperparameters. To learn more about how to customize the config and train +custom models, check out the [training config](/usage/training#config) docs. -| Name | Type | Description | -| ----------- | ------ | ------------------------------------- | -| `**kwargs` | - | Parameters for initializing the model | -| **RETURNS** | object | The initialized model. | +```python +https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/textcat_defaults.cfg +``` + + ## TextCategorizer.\_\_init\_\_ {#init tag="method"} -Create a new pipeline instance. 
In your application, you would normally use a
-shortcut for this and instantiate the component using its string name and
-[`nlp.create_pipe`](/api/language#create_pipe).
-
> #### Example
>
> ```python
> # Construction via create_pipe
> textcat = nlp.create_pipe("textcat")
-> textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True})
>
-> # Construction from class
+> # Construction via create_pipe with custom model
+> config = {"model": {"@architectures": "my_textcat"}}
+> textcat = nlp.create_pipe("textcat", config)
+>
+> # Construction from class with custom model from file
> from spacy.pipeline import TextCategorizer
-> textcat = TextCategorizer(nlp.vocab)
-> textcat.from_disk("/path/to/model")
+> model = util.load_config("model.cfg", create_objects=True)["model"]
+> textcat = TextCategorizer(nlp.vocab, model)
> ```

-| Name                | Type                          | Description                                                                                                                                            |
-| ------------------- | ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`             | `Vocab`                       | The shared vocabulary.                                                                                                                                 |
-| `model`             | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`.  |
-| `exclusive_classes` | bool                          | Make categories mutually exclusive. Defaults to `False`.                                                                                               |
-| `architecture`      | str                           | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`.                                                  |
-| **RETURNS**         | `TextCategorizer`             | The newly constructed object.                                                                                                                          |
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.create_pipe`](/api/language#create_pipe).
+| Name        | Type              | Description                                                                       |
+| ----------- | ----------------- | ----------------------------------------------------------------------------------- |
+| `vocab`     | `Vocab`           | The shared vocabulary.                                                            |
+| `model`     | `Model`           | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.  |
+| `**cfg`     | -                 | Configuration parameters.                                                         |
+| **RETURNS** | `TextCategorizer` | The newly constructed object.                                                     |
+
+

## TextCategorizer.\_\_call\_\_ {#call tag="method"}

@@ -102,11 +109,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
>     pass
> ```

-| Name         | Type     | Description                                             |
-| ------------ | -------- | ------------------------------------------------------- |
-| `stream`     | iterable | A stream of documents.                                  |
-| `batch_size` | int      | The number of texts to buffer. Defaults to `128`.       |
-| **YIELDS**   | `Doc`    | Processed documents in the order of the original text.  |
+| Name         | Type            | Description                                             |
+| ------------ | --------------- | ------------------------------------------------------- |
+| `stream`     | `Iterable[Doc]` | A stream of documents.                                  |
+| `batch_size` | int             | The number of texts to buffer. Defaults to `128`.       |
+| **YIELDS**   | `Doc`           | Processed documents in the order of the original text.  |

## TextCategorizer.predict {#predict tag="method"}

Apply the pipeline's model to a batch of docs, without modifying them.
> > ```python > textcat = TextCategorizer(nlp.vocab) -> scores, tensors = textcat.predict([doc1, doc2]) +> scores = textcat.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to predict. | -| **RETURNS** | tuple | A `(scores, tensors)` tuple where `scores` is the model's prediction for each document and `tensors` is the token representations used to predict the scores. Each tensor is an array with one row for each token in the document. | +| Name | Type | Description | +| ----------- | --------------- | ----------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to predict. | +| **RETURNS** | - | The model's prediction for each document. | ## TextCategorizer.set_annotations {#set_annotations tag="method"} @@ -132,15 +139,14 @@ Modify a batch of documents, using pre-computed scores. > > ```python > textcat = TextCategorizer(nlp.vocab) -> scores, tensors = textcat.predict([doc1, doc2]) -> textcat.set_annotations([doc1, doc2], scores, tensors) +> scores = textcat.predict(docs) +> textcat.set_annotations(docs, scores) > ``` -| Name | Type | Description | -| --------- | -------- | --------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | -| `tensors` | iterable | The token representations used to predict the scores. | +| Name | Type | Description | +| -------- | --------------- | --------------------------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to modify. | +| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | ## TextCategorizer.update {#update tag="method"} @@ -151,19 +157,20 @@ pipe's model. Delegates to [`predict`](/api/textcategorizer#predict) and > #### Example > > ```python -> textcat = TextCategorizer(nlp.vocab) -> losses = {} +> textcat = TextCategorizer(nlp.vocab, textcat_model) > optimizer = nlp.begin_training() -> textcat.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) +> losses = textcat.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------- | -------- | -------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of documents to learn from. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `drop` | float | The dropout rate. | -| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. | -| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | +| Name | Type | Description | +| ----------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). 
| +| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## TextCategorizer.get_loss {#get_loss tag="method"} @@ -174,21 +181,20 @@ predicted scores. > > ```python > textcat = TextCategorizer(nlp.vocab) -> scores = textcat.predict([doc1, doc2]) -> loss, d_loss = textcat.get_loss([doc1, doc2], [gold1, gold2], scores) +> scores = textcat.predict([eg.predicted for eg in examples]) +> loss, d_loss = textcat.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------ | -| `docs` | iterable | The batch of documents. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `scores` | - | Scores representing the model's predictions. | -| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Type | Description | +| ----------- | ------------------- | --------------------------------------------------- | +| `examples` | `Iterable[Example]` | The batch of examples. | +| `scores` | - | Scores representing the model's predictions. | +| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | ## TextCategorizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. If no model -has been initialized yet, the model is added. +Initialize the pipe for training, using data examples if available. Return an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example > @@ -198,12 +204,12 @@ has been initialized yet, the model is added. > optimizer = textcat.begin_training(pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | -| `pipeline` | list | Optional list of pipeline components that this component is part of. | -| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`TextCategorizer`](/api/textcategorizer#create_optimizer) if not set. | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| -------------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. | +| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. | +| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/textcategorizer#create_optimizer) if not set. | +| **RETURNS** | `Optimizer` | An optimizer. | ## TextCategorizer.create_optimizer {#create_optimizer tag="method"} @@ -216,9 +222,9 @@ Create an optimizer for the pipeline component. 
> optimizer = textcat.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | -------- | -------------- | -| **RETURNS** | callable | The optimizer. | +| Name | Type | Description | +| ----------- | ----------- | --------------------------------------------------------------- | +| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | ## TextCategorizer.use_params {#use_params tag="method, contextmanager"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md new file mode 100644 index 000000000..3667ed8ad --- /dev/null +++ b/website/docs/api/tok2vec.md @@ -0,0 +1,19 @@ +--- +title: Tok2Vec +source: spacy/pipeline/tok2vec.py +new: 3 +--- + +TODO: document + +## Default config {#config} + +This is the default configuration used to initialize the model powering the +pipeline component. See the [model architectures](/api/architectures) +documentation for details on the architectures and their arguments and +hyperparameters. To learn more about how to customize the config and train +custom models, check out the [training config](/usage/training#config) docs. + +```python +https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/tok2vec_defaults.cfg +``` diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 9094b46d3..8e9fff6aa 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -3,6 +3,7 @@ title: Top-level Functions menu: - ['spacy', 'spacy'] - ['displacy', 'displacy'] + - ['registry', 'registry'] - ['Data & Alignment', 'gold'] - ['Utility Functions', 'util'] --- @@ -33,7 +34,7 @@ loaded in via [`Language.from_disk`](/api/language#from_disk). | Name | Type | Description | | ----------- | ------------ | --------------------------------------------------------------------------------- | | `name` | str / `Path` | Model to load, i.e. package name or path. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | **RETURNS** | `Language` | A `Language` object with the loaded model. | Essentially, `spacy.load()` is a convenience wrapper that reads the language ID @@ -60,11 +61,11 @@ Create a blank model of a given language class. This function is the twin of > nlp_de = spacy.blank("de") > ``` -| Name | Type | Description | -| ----------- | ---------- | ------------------------------------------------------------------------------------------------ | -| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. | +| Name | Type | Description | +| ----------- | ----------- | ------------------------------------------------------------------------------------------------ | +| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. 
|

#### spacy.info {#spacy.info tag="function"}

@@ -259,6 +260,48 @@ package can also expose a
[`spacy_displacy_colors` entry
point](/usage/saving-loading#entry-points-displacy) to add custom labels and
their colors automatically.

+## registry {#registry source="spacy/util.py" new="3"}
+
+spaCy's function registry extends
+[Thinc's `registry`](https://thinc.ai/docs/api-config#registry) and allows you
+to map strings to functions. You can register functions to create architectures,
+optimizers, schedules and more, and then refer to them and set their arguments
+in your [config file](/usage/training#config). Python type hints are used to
+validate the inputs. See the
+[Thinc docs](https://thinc.ai/docs/api-config#registry) for details on the
+`registry` methods and our helper library
+[`catalogue`](https://github.com/explosion/catalogue) for some background on the
+concept of function registries. spaCy also uses the function registry for
+language subclasses, model architectures, lookups and pipeline component
+factories.
+
+
+
+> #### Example
+>
+> ```python
+> import spacy
+> from thinc.api import Model
+>
+> @spacy.registry.architectures("CustomNER.v1")
+> def custom_ner(nO: int) -> Model:
+>     return Model("custom", forward, dims={"nO": nO})
+> ```
+
+| Registry name     | Description                                                                                                                                                                                                                                          |
+| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `architectures`   | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`.                                                                            |
+| `factories`       | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points).  |
+| `languages`       | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points).                                                                                                                  |
+| `lookups`         | Registry for large lookup tables available via `vocab.lookups`.                                                                                                                                                                                     |
+| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points).                                                                              |
+| `assets`          |                                                                                                                                                                                                                                                      |
+| `optimizers`      | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers).                                                                                                                                                              |
+| `schedules`       | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules).                                                                                                                                                                |
+| `layers`          | Registry for functions that create [layers](https://thinc.ai/docs/api-layers).                                                                                                                                                                      |
+| `losses`          | Registry for functions that create [losses](https://thinc.ai/docs/api-loss).                                                                                                                                                                        |
+| `initializers`    | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers).                                                                                                                                                          |
+
## Training data and alignment {#gold source="spacy/gold"}

### gold.docs_to_json {#docs_to_json tag="function"}
@@ -421,6 +464,8 @@ page should be safe to use and we'll try to ensure backwards compatibility.
However, we recommend having additional tests in place if your application
depends on any of spaCy's utilities.

+
+
### util.get_lang_class {#util.get_lang_class tag="function"}

Import and load a `Language` class. Allows lazy-loading
@@ -674,8 +719,7 @@ vary on each step.
> ```python > batches = minibatch(train_data) > for batch in batches: -> texts, annotations = zip(*batch) -> nlp.update(texts, annotations) +> nlp.update(batch) > ``` | Name | Type | Description | @@ -705,7 +749,7 @@ of one entity) or when merging spans with | `spans` | iterable | The spans to filter. | | **RETURNS** | list | The filtered spans. | -## util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} +### util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} diff --git a/website/docs/images/training-loop.svg b/website/docs/images/training-loop.svg index e883b36be..144fe2d3d 100644 --- a/website/docs/images/training-loop.svg +++ b/website/docs/images/training-loop.svg @@ -26,7 +26,7 @@ - GoldParse + Example diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 10910b93b..b25e46f1e 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -3,7 +3,6 @@ title: Models teaser: Downloadable pretrained models for spaCy menu: - ['Quickstart', 'quickstart'] - - ['Model Architecture', 'architecture'] - ['Conventions', 'conventions'] --- @@ -27,7 +26,7 @@ import QuickstartModels from 'widgets/quickstart-models.js' - + For more details on how to use models with spaCy, see the [usage guide on models](/usage/models). diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index 4363b9b4f..95158b67d 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -45,10 +45,11 @@ an **annotated document**. It also orchestrates training and serialization. ### Other classes {#architecture-other} -| Name | Description | -| --------------------------------- | ------------------------------------------------------------------------------------------------------------- | -| [`Vocab`](/api/vocab) | A lookup table for the vocabulary that allows you to access `Lexeme` objects. | -| [`StringStore`](/api/stringstore) | Map strings to and from hash values. | -| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. | -| [`GoldParse`](/api/goldparse) | Collection for training annotations. | -| [`GoldCorpus`](/api/goldcorpus) | An annotated corpus, using the JSON file format. Manages annotations for tagging, dependency parsing and NER. | +| Name | Description | +| --------------------------------- | ----------------------------------------------------------------------------- | +| [`Vocab`](/api/vocab) | A lookup table for the vocabulary that allows you to access `Lexeme` objects. | +| [`StringStore`](/api/stringstore) | Map strings to and from hash values. | +| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. | +| [`Example`](/api/example) | Collection for training annotations. 
|

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 99612a6bb..9c028ce61 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -28,7 +28,7 @@ import PosDeps101 from 'usage/101/\_pos-deps.md'
 
-
+
 
 For a list of the fine-grained and coarse-grained part-of-speech tags assigned
 by spaCy's models across different languages, see the label schemes documented
@@ -287,7 +287,7 @@ for token in doc:
 | their    | `ADJ`  | `poss`     | requests |
 | requests | `NOUN` | `dobj`     | submit   |
 
-
+
 
 For a list of the syntactic dependency labels assigned by spaCy's models across
 different languages, see the label schemes documented in the
@@ -615,7 +615,7 @@ tokens containing periods intact (abbreviations like "U.S.").
 
 ![Language data architecture](../images/language_data.svg)
 
-
+
 
 For more details on the language-specific data, see the usage guide on
 [adding languages](/usage/adding-languages).
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index 8157e2c07..4c8bc1664 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -338,7 +338,7 @@ nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory
 
 doc = nlp("This is a sentence.")
 ```
 
-
+
 
 You can use the [`info`](/api/cli#info) command or
 [`spacy.info()`](/api/top-level#spacy.info) method to print a model's meta data
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 32d6bf7a2..fc335ac5d 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -34,7 +34,7 @@ texts = ["This is a text", "These are lots of texts", "..."]
 + docs = list(nlp.pipe(texts))
 ```
 
-
+
 
 - Process the texts **as a stream** using [`nlp.pipe`](/api/language#pipe) and
   buffer them in batches, instead of one-by-one. This is usually much more
@@ -912,7 +912,7 @@ new_heads = [head - i - 1 if head != 0 else 0 for i, head in enumerate(heads)]
 
-
+
 
 For more details on how to write and package custom components, make them
 available to spaCy via entry points and implement your own serialization
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 2631f1438..c56044be0 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -1,5 +1,664 @@
---
 title: Projects
+new: 3
+menu:
+  - ['Intro & Workflow', 'intro']
+  - ['Directory & Assets', 'directory']
+  - ['Custom Projects', 'custom']
+  - ['Integrations', 'integrations']
---
 
-TODO: write
+> #### 🪐 Project templates
+>
+> Our [`projects`](https://github.com/explosion/projects) repo includes various
+> project templates for different NLP tasks, models, workflows and integrations
+> that you can clone and run. The easiest way to get started is to pick a
+> template, clone it and start modifying it!
+
+spaCy projects let you manage and share **end-to-end spaCy workflows** for
+different **use cases and domains**, and orchestrate training, packaging and
+serving your custom models. You can start off by cloning a pre-defined project
+template, adjusting it to fit your needs, loading in your data, training a
+model, exporting it as a Python package and sharing the project template with
+your team. spaCy projects can be used via the new
+[`spacy project`](/api/cli#project) command. For an overview of the available
+project templates, check out the
+[`projects`](https://github.com/explosion/projects) repo.
spaCy projects also
+[integrate](#integrations) with many other cool machine learning and data
+science tools to track and manage your data and experiments, iterate on demos
+and prototypes and ship your models into production.
+
+
+
+## Introduction and workflow {#intro}
+
+
+
+
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
+sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
+mattis pretium.
+
+
+
+spaCy projects make it easy to integrate with many other **awesome tools** in
+the data science and machine learning ecosystem to track and manage your data
+and experiments, iterate on demos and prototypes and ship your models into
+production.
+
+
+Manage and version your data
+Create labelled training data
+Visualize and demo your models
+Serve your models and host APIs
+Distributed and parallel training
+Track your experiments and results
+
+
+### 1. Clone a project template {#clone}
+
+> #### Cloning under the hood
+>
+> To clone a project, spaCy calls into `git` and uses the "sparse checkout"
+> feature to only clone the relevant directory or directories.
+
+The [`spacy project clone`](/api/cli#project-clone) command clones an existing
+project template and copies the files to a local directory. You can then run
+the project, e.g. to train a model, and edit the commands and scripts to build
+fully custom workflows.
+
+```bash
+$ python -m spacy project clone some_example_project
+```
+
+By default, the project will be cloned into the current working directory. You
+can specify an optional second argument to define the output directory. The
+`--repo` option lets you define a custom repo to clone from, if you don't want
+to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
+can also use any private repo you have access to with Git.
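+
+Under the hood, the "sparse checkout" described above corresponds roughly to
+the following `git` commands – an illustrative sketch only, the exact
+invocation spaCy uses may differ:
+
+```bash
+### Rough equivalent of the sparse-checkout clone (illustrative)
+git clone --no-checkout https://github.com/explosion/projects tmp
+cd tmp
+git sparse-checkout init                      # requires Git 2.25+
+git sparse-checkout set some_example_project  # only fetch this directory
+git checkout
+```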
+### 2. Fetch the project assets {#assets}
+
+> #### project.yml
+>
+> ```yaml
+> assets:
+>   - dest: 'assets/training.spacy'
+>     url: 'https://example.com/data.spacy'
+>     checksum: '63373dd656daa1fd3043ce166a59474c'
+> ```
+
+Assets are data files your project needs – for example, the training and
+evaluation data or pretrained vectors and embeddings to initialize your model
+with. Each project template comes with a `project.yml` that defines the assets
+to download and where to put them. The
+[`spacy project assets`](/api/cli#project-assets) command will fetch the
+project assets for you:
+
+```bash
+$ cd some_example_project
+$ python -m spacy project assets
+```
+
+### 3. Run a command {#run}
+
+> #### project.yml
+>
+> ```yaml
+> commands:
+>   - name: preprocess
+>     help: "Convert the input data to spaCy's format"
+>     script:
+>       - 'python -m spacy convert assets/train.conllu corpus/'
+>       - 'python -m spacy convert assets/eval.conllu corpus/'
+>     deps:
+>       - 'assets/train.conllu'
+>       - 'assets/eval.conllu'
+>     outputs:
+>       - 'corpus/train.spacy'
+>       - 'corpus/eval.spacy'
+> ```
+
+Commands consist of one or more steps and can be run with
+[`spacy project run`](/api/cli#project-run). The following will run the command
+`preprocess` defined in the `project.yml`:
+
+```bash
+$ python -m spacy project run preprocess
+```
+
+Commands can define their expected [dependencies and outputs](#deps-outputs)
+using the `deps` (files the commands require) and `outputs` (files the commands
+create) keys. This allows your project to track changes and determine whether a
+command needs to be re-run. For instance, if your input data changes, you want
+to re-run the `preprocess` command. But if nothing changed, this step can be
+skipped. You can also set `--force` to force re-running a command, or `--dry` to
+perform a "dry run" and see what would happen (without actually running the
+script).
+
+### 4. Run a workflow {#run-workflow}
+
+> #### project.yml
+>
+> ```yaml
+> workflows:
+>   all:
+>     - preprocess
+>     - train
+>     - package
+> ```
+
+Workflows are a series of commands that are run in order and often depend on
+each other. For instance, to generate a packaged model, you might start by
+converting your data, then run [`spacy train`](/api/cli#train) to train your
+model on the converted data and, if that's successful, run
+[`spacy package`](/api/cli#package) to turn the best model artifact into an
+installable Python package. The following command runs the workflow named `all`
+defined in the `project.yml`, and executes the commands it specifies, in order:
+
+```bash
+$ python -m spacy project run all
+```
+
+Using the expected [dependencies and outputs](#deps-outputs) defined in the
+commands, spaCy can determine whether to re-run a command (if its inputs or
+outputs have changed) or whether to skip it. If you're looking to implement more
+advanced data pipelines and track your changes in Git, check out the
+[Data Version Control (DVC) integration](#dvc). The
+[`spacy project dvc`](/api/cli#project-dvc) command generates a DVC config file
+from a workflow defined in your `project.yml` so you can manage your spaCy
+project as a DVC repo.
+
+## Project directory and assets {#directory}
+
+### project.yml {#project-yml}
+
+The `project.yml` defines the assets a project depends on, like datasets and
+pretrained weights, as well as a series of commands that can be run separately
+or as a workflow – for instance, to preprocess the data, convert it to spaCy's
+format, train a model, evaluate it and export metrics, package it and spin up a
+quick web demo. It looks pretty similar to a config file used to define CI
+pipelines.
+
+
+
+```yaml
+https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.yml
+```
+
+| Section       | Description |
+| ------------- | ----------- |
+| `variables`   | A dictionary of variables that can be referenced in paths, URLs and scripts. For example, `{NAME}` will use the value of the variable `NAME`. |
+| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
+| `assets`      | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. |
+| `workflows`   | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
+| `commands`    | A list of named commands.
A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` keys let you define the files the command depends on and the files it produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
+
+### Dependencies and outputs {#deps-outputs}
+
+Each command defined in the `project.yml` can optionally define a list of
+dependencies and outputs. These are the files the command requires and creates.
+For example, a command for training a model may depend on a
+[`config.cfg`](/usage/training#config) and the training and evaluation data, and
+it will export a directory `model-best`, containing the best model, which you
+can then re-use in other commands.
+
+```yaml
+### project.yml
+commands:
+  - name: train
+    help: 'Train a spaCy model using the specified corpus and config'
+    script:
+      - 'python -m spacy train ./corpus/training.spacy ./corpus/evaluation.spacy ./configs/config.cfg -o training/'
+    deps:
+      - 'configs/config.cfg'
+      - 'corpus/training.spacy'
+      - 'corpus/evaluation.spacy'
+    outputs:
+      - 'training/model-best'
+```
+
+> #### Re-running vs. skipping
+>
+> Under the hood, spaCy uses a `project.lock` lockfile that stores the details
+> for each command, as well as its dependencies and outputs and their checksums.
+> It's updated on each run. If any of this information changes, the command will
+> be re-run. Otherwise, it will be skipped.
+
+If you're running a command and it depends on files that are missing, spaCy will
+show you an error. If a command defines dependencies and outputs that haven't
+changed since the last run, the command will be skipped. This means that you're
+only re-running commands if they need to be re-run. Commands can also set
+`no_skip: true` if they should never be skipped – for example commands that run
+tests. Commands without outputs are also never skipped. To force re-running a
+command or workflow, even if nothing changed, you can set the `--force` flag.
+
+Note that [`spacy project`](/api/cli#project) doesn't compile any dependency
+graphs based on the dependencies and outputs, and won't re-run previous steps
+automatically. For instance, if you only run the command `train` that depends on
+data created by `preprocess` and those files are missing, spaCy will show an
+error – it won't just re-run `preprocess`. If you're looking for more advanced
+data management, check out the [Data Version Control (DVC)](#dvc) integration.
+If you're planning on integrating your spaCy project with DVC, you can also use
+`outputs_no_cache` instead of `outputs` to define outputs that won't be cached
+or tracked.
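+
+To make the re-run check concrete, here's a rough sketch of the logic described
+above – purely illustrative, not spaCy's actual implementation. It assumes MD5
+checksums stored per command in `project.lock`, keyed by file path:
+
+```python
+### Illustrative sketch of the skip check (not spaCy's actual code)
+import hashlib
+from pathlib import Path
+
+import srsly
+
+def file_checksum(path):
+    return hashlib.md5(Path(path).read_bytes()).hexdigest()
+
+def needs_rerun(command, lock_path="project.lock"):
+    """Re-run if any dep or output is missing or its checksum changed."""
+    if not Path(lock_path).exists():
+        return True
+    lock = srsly.read_yaml(lock_path).get(command["name"], {})
+    for path in command.get("deps", []) + command.get("outputs", []):
+        if not Path(path).exists() or lock.get(path) != file_checksum(path):
+            return True
+    return False
+```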
+### Files and directory structure {#project-files}
+
+The `project.yml` can define a list of `directories` that should be created
+within a project – for instance, `assets`, `training`, `corpus` and so on. spaCy
+will make sure that these directories are always available, so your commands can
+write to and read from them. Project directories will also include all files and
+directories copied from the project template with
+[`spacy project clone`](/api/cli#project-clone). Here's an example of a project
+directory:
+
+> #### project.yml
+>
+> ```yaml
+> directories: ['assets', 'configs', 'corpus', 'metas', 'metrics', 'notebooks', 'packages', 'scripts', 'training']
+> ```
+
+```yaml
+### Example project directory
+├── project.yml        # the project settings
+├── project.lock       # lockfile that tracks inputs/outputs
+├── assets/            # downloaded data assets
+├── configs/           # model config.cfg files used for training
+├── corpus/            # output directory for training corpus
+├── metas/             # model meta.json templates used for packaging
+├── metrics/           # output directory for evaluation metrics
+├── notebooks/         # directory for Jupyter notebooks
+├── packages/          # output directory for model Python packages
+├── scripts/           # directory for scripts, e.g. referenced in commands
+├── training/          # output directory for trained models
+└── ...                # any other files, like a requirements.txt etc.
+```
+
+If you don't want a project to create a directory, you can delete it and remove
+its entry from the `project.yml` – just make sure it's not required by any of
+the commands. [Custom templates](#custom) can use any directories they need –
+the only file that's required for a project is the `project.yml`.
+
+---
+
+## Custom scripts and projects {#custom}
+
+The `project.yml` lets you define any custom commands and run them as part of
+your training, evaluation or deployment workflows. The `script` section defines
+a list of commands that are called in a subprocess, in order. This lets you
+execute other Python scripts or command-line tools. Let's say you've written a
+few integration tests that load the best model produced by the training command
+and check that it works correctly. You can now define a `test` command that
+calls into [`pytest`](https://docs.pytest.org/en/latest/), runs your tests and
+uses [`pytest-html`](https://github.com/pytest-dev/pytest-html) to export a test
+report:
+
+```yaml
+### project.yml
+commands:
+  - name: test
+    help: 'Test the trained model'
+    script:
+      - 'pip install pytest pytest-html'
+      - 'python -m pytest ./scripts/tests --html=metrics/test-report.html'
+    deps:
+      - 'training/model-best'
+    outputs:
+      - 'metrics/test-report.html'
+    no_skip: true
+```
+
+Adding `training/model-best` to the command's `deps` lets you ensure that the
+file is available. If not, spaCy will show an error and the command won't run.
+Setting `no_skip: true` means that the command will always run, even if the
+dependencies (the trained model) haven't changed. This makes sense here, because
+you typically don't want to skip your tests.
+
+### Writing custom scripts {#custom-scripts}
+
+Your project commands can include any custom scripts – essentially, anything you
+can run from the command line. Here's an example of a custom script that uses
+[`typer`](https://typer.tiangolo.com/) for quick and easy command-line arguments
+that you can define via your `project.yml`:
+
+> #### About Typer
+>
+> [`typer`](https://typer.tiangolo.com/) is a modern library for building Python
+> CLIs using type hints. It's a dependency of spaCy, so it will already be
+> pre-installed in your environment. Function arguments automatically become
+> positional CLI arguments and using Python type hints, you can define the value
+> types. For instance, `batch_size: int` means that the value provided via the
+> command line is converted to an integer.
+
+```python
+### scripts/custom_evaluation.py
+import typer
+
+def custom_evaluation(batch_size: int, model_path: str, data_path: str):
+    # The arguments are now available as positional CLI arguments
+    print(batch_size, model_path, data_path)
+
+if __name__ == "__main__":
+    typer.run(custom_evaluation)
+```
+
+In your `project.yml`, you can then run the script by calling
+`python scripts/custom_evaluation.py` with the function arguments. You can also
+use the `variables` section to define reusable variables that will be
+substituted in commands, paths and URLs. In this example, `BATCH_SIZE` is
+defined as a variable and will be substituted in place of `{BATCH_SIZE}` in the
+script.
+
+> #### Calling into Python
+>
+> If any of your command scripts call into `python`, spaCy will take care of
+> replacing that with your `sys.executable`, to make sure you're executing
+> everything with the same Python (not some other Python installed on your
+> system). It also normalizes references to `python3`, `pip3` and `pip`.
+
+```yaml
+### project.yml
+variables:
+  BATCH_SIZE: 128
+
+commands:
+  - name: evaluate
+    script:
+      - 'python scripts/custom_evaluation.py {BATCH_SIZE} ./training/model-best ./corpus/eval.json'
+    deps:
+      - 'training/model-best'
+      - 'corpus/eval.json'
+```
+
+### Cloning from your own repo {#custom-repo}
+
+The [`spacy project clone`](/api/cli#project-clone) command lets you customize
+the repo to clone from using the `--repo` option. It calls into `git`, so you'll
+be able to clone from any repo that you have access to, including private repos.
+
+```bash
+$ python -m spacy project clone your_project --repo https://github.com/you/repo
+```
+
+At a minimum, a valid project template needs to contain a
+[`project.yml`](#project-yml). It can also include
+[other files](/usage/projects#project-files), like custom scripts, a
+`requirements.txt` listing additional dependencies,
+[training configs](/usage/training#config) and model meta templates, or Jupyter
+notebooks with usage examples.
+
+
+
+It's typically not a good idea to check large data assets, trained models or
+other artifacts into a Git repo and you should exclude them from your project
+template by adding a `.gitignore`. If you want to version your data and models,
+check out [Data Version Control](#dvc) (DVC), which integrates with spaCy
+projects.
+
+
+
+### Working with private assets {#private-assets}
+
+For many projects, the datasets and weights you're working with might be
+company-internal and not available via a public URL. In that case, you can
+specify the destination paths and a checksum, and leave out the URL. When your
+teammates clone and run your project, they can place the files in the respective
+directory themselves. The [`spacy project assets`](/api/cli#project-assets)
+command will alert about missing files and mismatched checksums, so you can
+ensure that others are running your project with the same data.
+
+```yaml
+### project.yml
+assets:
+  - dest: 'assets/private_training_data.json'
+    checksum: '63373dd656daa1fd3043ce166a59474c'
+  - dest: 'assets/private_vectors.bin'
+    checksum: '5113dc04e03f079525edd8df3f4f39e3'
+```
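+
+To generate the `checksum` values for your own files, you can use a small
+helper like the following – this assumes the 32-character MD5 format shown in
+the examples above:
+
+```python
+### Compute an asset checksum (assumes MD5, matching the examples above)
+import hashlib
+from pathlib import Path
+
+data = Path("assets/private_training_data.json").read_bytes()
+print(hashlib.md5(data).hexdigest())
+```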
+## Integrations {#integrations}
+
+### Data Version Control (DVC) {#dvc}
+
+Data assets like training corpora or pretrained weights are at the core of any
+NLP project, but they're often difficult to manage: you can't just check them
+into your Git repo to version and keep track of them. And if you have multiple
+steps that depend on each other, like a preprocessing step that generates your
+training data, you need to make sure the data is always up-to-date, and re-run
+all steps of your process every time, just to be safe.
+
+[Data Version Control](https://dvc.org) (DVC) is a standalone open-source tool
+that integrates into your workflow like Git, builds a dependency graph for your
+data pipelines and tracks and caches your data files. If you're downloading data
+from an external source, like a storage bucket, DVC can tell whether the
+resource has changed. It can also determine whether to re-run a step, depending
+on whether its inputs have changed or not. All metadata can be checked into a
+Git repo, so you'll always be able to reproduce your experiments.
+
+To set up DVC, install the package and initialize your spaCy project as a Git
+and DVC repo. You can also
+[customize your DVC installation](https://dvc.org/doc/install/macos#install-with-pip)
+to include support for remote storage like Google Cloud Storage, S3, Azure, SSH
+and more.
+
+```bash
+pip install dvc   # Install DVC
+git init          # Initialize a Git repo
+dvc init          # Initialize a DVC project
+```
+
+The [`spacy project dvc`](/api/cli#project-dvc) command creates a `dvc.yaml`
+config file based on a workflow defined in your `project.yml`. Whenever you
+update your project, you can re-run the command to update your DVC config. You
+can then manage your spaCy project like any other DVC project, run
+[`dvc add`](https://dvc.org/doc/command-reference/add) to add and track assets
+and [`dvc repro`](https://dvc.org/doc/command-reference/repro) to reproduce the
+workflow or individual commands.
+
+```bash
+$ python -m spacy project dvc [workflow name]
+```
+
+
+
+DVC currently expects a single workflow per project, so when creating the config
+with [`spacy project dvc`](/api/cli#project-dvc), you need to specify the name
+of a workflow defined in your `project.yml`. You can still use multiple
+workflows, but only one can be tracked by DVC.
+
+
+
+
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
+sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
+mattis pretium.
+
+
+
+---
+
+### Prodigy {#prodigy}
+
+[Prodigy](https://prodi.gy) is a modern annotation tool for creating training
+data for machine learning models, developed by us. It integrates with spaCy
+out-of-the-box and provides many different
+[annotation recipes](https://prodi.gy/docs/recipes) for a variety of NLP tasks,
+with and without a model in the loop. If Prodigy is installed in your project,
+you can start the annotation server from your `project.yml` for a tight feedback
+loop between data development and training.
+
+The following example command starts the Prodigy app using the
+[`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in
+suggestions for the given entity labels produced by a pretrained model. You can
+then correct the suggestions manually in the UI. After you save and exit the
+server, the full dataset is exported in spaCy's format and split into a training
+and evaluation set.
+
+> #### Example usage
+>
+> ```bash
+> $ python -m spacy project run annotate
+> ```
+
+```yaml
+### project.yml
+variables:
+  PRODIGY_DATASET: 'ner_articles'
+  PRODIGY_LABELS: 'PERSON,ORG,PRODUCT'
+  PRODIGY_MODEL: 'en_core_web_md'
+
+commands:
+  - name: annotate
+    script:
+      - 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
+      - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}'
+      - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
+      - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
+    deps:
+      - 'assets/raw_data.jsonl'
+    outputs:
+      - 'corpus/train.spacy'
+      - 'corpus/eval.spacy'
+```
+
+You can use the same approach for other types of projects and annotation
+workflows, including
+[text classification](https://prodi.gy/docs/recipes#textcat),
+[dependency parsing](https://prodi.gy/docs/recipes#dep),
+[part-of-speech tagging](https://prodi.gy/docs/recipes#pos) or fully
+[custom recipes](https://prodi.gy/docs/custom-recipes) – for instance, an A/B
+evaluation workflow that lets you compare two different models and their
+results.
+
+
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
+sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
+mattis pretium.
+
+
+
+---
+
+### Streamlit {#streamlit}
+
+
+
+[Streamlit](https://streamlit.io) is a Python framework for building interactive
+data apps. The [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit)
+package helps you integrate spaCy visualizations into your Streamlit apps and
+quickly spin up demos to explore your models interactively. It includes a full
+embedded visualizer, as well as individual components.
+
+```bash
+$ pip install spacy-streamlit
+```
+
+ +![](../images/spacy-streamlit.png) + +
+
+Using [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit), your
+projects can easily define their own scripts that spin up an interactive
+visualizer, using the latest model you trained, or a selection of models so you
+can compare their results. The following script starts an
+[NER visualizer](/usage/visualizers#ent) and takes two positional command-line
+arguments you can pass in from your `project.yml`: a comma-separated list of
+model paths and an example text to use as the default text.
+
+```python
+### scripts/visualize.py
+import sys
+
+import spacy_streamlit
+
+DEFAULT_TEXT = sys.argv[2] if len(sys.argv) >= 3 else ""
+MODELS = [name.strip() for name in sys.argv[1].split(",")]
+spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"])
+```
+
+> #### Example usage
+>
+> ```bash
+> $ python -m spacy project run visualize
+> ```
+
+```yaml
+### project.yml
+commands:
+  - name: visualize
+    help: "Visualize the model's output interactively using Streamlit"
+    script:
+      - 'streamlit run ./scripts/visualize.py ./training/model-best "I like Adidas shoes."'
+    deps:
+      - 'training/model-best'
+```
+
+
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
+sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
+mattis pretium.
+
+
+
+---
+
+### FastAPI {#fastapi}
+
+[FastAPI](https://fastapi.tiangolo.com/) is a modern high-performance framework
+for building REST APIs with Python, based on Python
+[type hints](https://fastapi.tiangolo.com/python-types/). It's become a popular
+library for serving machine learning models.
+
+```python
+# TODO: show an example that addresses some of the main concerns for serving ML (workers etc.)
+```
+
+> #### Example usage
+>
+> ```bash
+> $ python -m spacy project run serve
+> ```
+
+```yaml
+### project.yml
+commands:
+  - name: serve
+    help: "Serve the trained model with FastAPI"
+    script:
+      - 'python ./scripts/serve.py ./training/model-best'
+    deps:
+      - 'training/model-best'
+    no_skip: true
+```
+
+
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
+sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
+mattis pretium.
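+
+As a stopgap for the TODO above, here's a minimal sketch of what a
+`scripts/serve.py` along the lines of the `serve` command could look like. The
+endpoint name, request model and response format are illustrative assumptions,
+not part of spaCy's API:
+
+```python
+### scripts/serve.py (illustrative sketch)
+import sys
+
+import spacy
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+# Load the model path passed in from the project.yml command
+nlp = spacy.load(sys.argv[1])
+app = FastAPI()
+
+class TextRequest(BaseModel):
+    text: str
+
+@app.post("/ner")
+def ner(request: TextRequest):
+    # Return the entities predicted by the trained model
+    doc = nlp(request.text)
+    return {
+        "ents": [
+            {"text": ent.text, "label": ent.label_, "start": ent.start_char, "end": ent.end_char}
+            for ent in doc.ents
+        ]
+    }
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
+```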
+ + + +--- + +### Ray {#ray} + + + +--- + +### Weights & Biases {#wandb} + + diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index e89e41586..392bcf0c0 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -552,7 +552,7 @@ component with different patterns, depending on your application: html_merger = BadHTMLMerger(nlp, path="/path/to/patterns.json") ``` - + For more details and examples of how to **create custom pipeline components** and **extension attributes**, see the diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 0cfe404f2..19580dc0f 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -198,7 +198,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md' - + To learn more about how spaCy's tokenization rules work in detail, how to **customize and replace** the default tokenizer and how to **add @@ -214,7 +214,7 @@ import PosDeps101 from 'usage/101/\_pos-deps.md' - + To learn more about **part-of-speech tagging** and rule-based morphology, and how to **navigate and use the parse tree** effectively, see the usage guides on @@ -229,7 +229,7 @@ import NER101 from 'usage/101/\_named-entities.md' - + To learn more about entity recognition in spaCy, how to **add your own entities** to a document and how to **train and update** the entity predictions @@ -245,7 +245,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md' - + To learn more about word vectors, how to **customize them** and how to load **your own vectors** into spaCy, see the usage guide on @@ -259,7 +259,7 @@ import Pipelines101 from 'usage/101/\_pipelines.md' - + To learn more about **how processing pipelines work** in detail, how to enable and disable their components, and how to **create your own**, see the usage @@ -458,7 +458,7 @@ import Serialization101 from 'usage/101/\_serialization.md' - + To learn more about how to **save and load your own models**, see the usage guide on [saving and loading](/usage/saving-loading#models). @@ -471,7 +471,7 @@ import Training101 from 'usage/101/\_training.md' - + To learn more about **training and updating** models, how to create training data and how to improve spaCy's named entity recognition models, see the usage @@ -485,14 +485,6 @@ import LanguageData101 from 'usage/101/\_language-data.md' - - -To learn more about the individual components of the language data and how to -**add a new language** to spaCy in preparation for training a language model, -see the usage guide on [adding languages](/usage/adding-languages). 
-
-
-
 ## Lightning tour {#lightning-tour}
 
 The following examples and code snippets give you an overview of spaCy's @@ -641,8 +633,9 @@ for ent in doc.ents:
 
 ### Train and update neural network models {#lightning-tour-training}
 
 ```python
-import spacy
 import random
+import spacy
+from spacy.gold import Example
 
 nlp = spacy.load("en_core_web_sm")
 train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})] @@ -652,7 +645,9 @@ with nlp.select_pipes(enable="ner"):
     for i in range(10):
         random.shuffle(train_data)
         for text, annotations in train_data:
-            nlp.update([text], [annotations], sgd=optimizer)
+            doc = nlp.make_doc(text)
+            example = Example.from_dict(doc, annotations)
+            nlp.update([example], sgd=optimizer)
 nlp.to_disk("/model")
 ```
 
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 2bbf5dddd..597ade4e6 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -4,8 +4,8 @@ next: /usage/projects
 menu:
   - ['Introduction', 'basics']
   - ['CLI & Config', 'cli-config']
-  - ['Custom Models', 'custom-models']
   - ['Transfer Learning', 'transfer-learning']
+  - ['Custom Models', 'custom-models']
   - ['Parallel Training', 'parallel-training']
   - ['Internal API', 'api']
---
@@ -103,26 +103,38 @@ still look good.
 
 > #### Migration from spaCy v2.x
 >
-> TODO: ...
+> TODO: once we have an answer for how to update the training command
+> (`spacy migrate`?), add details here
 
 Training config files include all **settings and hyperparameters** for training
 your model. Instead of providing lots of arguments on the command line, you only
-need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
+need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). Under
+the hood, the training config uses the
+[configuration system](https://thinc.ai/docs/usage-config) provided by our
+machine learning library [Thinc](https://thinc.ai). This also makes it easy to
+integrate custom models and architectures, written in your framework of choice.
+Some of the main advantages and features of spaCy's training config are:
 
-To read more about how the config system works under the hood, check out the
-[Thinc documentation](https://thinc.ai/docs/usage-config).
-
-- **Structured sections.**
+- **Structured sections.** The config is grouped into sections, and nested
+  sections are defined using the `.` notation. For example, `[nlp.pipeline.ner]`
+  defines the settings for the pipeline's named entity recognizer. The config
+  can be loaded as a Python dict.
 - **References to registered functions.** Sections can refer to registered
   functions like [model architectures](/api/architectures),
   [optimizers](https://thinc.ai/docs/api-optimizers) or
   [schedules](https://thinc.ai/docs/api-schedules) and define arguments that are
   passed into them. You can also register your own functions to define
-  [custom architectures](#custom-models), reference them in your config,
+  [custom architectures](#custom-models), reference them in your config and
+  tweak their parameters.
 - **Interpolation.** If you have hyperparameters used by multiple components,
   define them once and reference them as variables.
-
-
+- **Reproducibility with no hidden defaults.** The config file is the "single
+  source of truth" and includes all settings.
+- **Automated checks and validation.** When you load a config, spaCy checks if
+  the settings are complete and if all values have the correct types. This lets
+  you catch potential mistakes early.
In your custom architectures, you can use
+  Python [type hints](https://docs.python.org/3/library/typing.html) to tell the
+  config which types of data to expect. @@ -181,26 +193,60 @@ pretrained_vectors = null
 dropout = null
 ```
 
+
+
+
+
+For a full overview of spaCy's config format and settings, see the
+[training format documentation](/api/data-formats#config). The settings
+available for the different architectures are documented with the
+[model architectures API](/api/architectures). See the Thinc documentation for
+[optimizers](https://thinc.ai/docs/api-optimizers) and
+[schedules](https://thinc.ai/docs/api-schedules).
+
+
+
+#### Using registered functions {#config-functions}
+
+The training configuration defined in the config file doesn't have to only
+consist of static values. Some settings can also be **functions**. For instance,
+the `batch_size` can be a number that doesn't change, or a schedule, like a
+sequence of compounding values, which has been shown to be an effective trick
+(see [Smith et al., 2017](https://arxiv.org/abs/1711.00489)).
+
+```ini
+### With static value
+[training]
+batch_size = 128
+```
+
+To refer to a function instead, you can make `[training.batch_size]` its own
+section and use the `@` syntax to specify the function and its arguments – in
+this case [`compounding.v1`](https://thinc.ai/docs/api-schedules#compounding)
+defined in the [function registry](/api/top-level#registry). All other values
+defined in the block are passed to the function as keyword arguments when it's
+initialized. You can also use this mechanism to register
+[custom implementations and architectures](#custom-models) and reference them
+from your configs.
+
+> #### TODO
+>
+> TODO: something about how the tree is built bottom-up?
+
+```ini
+### With registered function
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+```
+
 ### Model architectures {#model-architectures}
 
-## Custom model implementations and architectures {#custom-models}
-
-
-
-
-
-Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
-sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
-mattis pretium.
-
-
-
-### Training with custom code
-
-
-
+
 
 ## Transfer learning {#transfer-learning} @@ -220,6 +266,101 @@ visualize your model.
 
+
+## Custom model implementations and architectures {#custom-models}
+
+
+
+### Training with custom code {#custom-code}
+
+The [`spacy train`](/api/cli#train) command lets you specify an optional
+argument `--code` that points to a Python file. The file is imported before
+training and allows you to add custom functions and architectures to the
+function registry that can then be referenced from your `config.cfg`. This lets
+you train spaCy models with custom components, without having to re-implement
+the whole training workflow.
+
+For example, let's say you've implemented your own batch size schedule to use
+during training. The `@spacy.registry.schedules` decorator lets you register
+that function in the `schedules` [registry](/api/top-level#registry) and assign
+it a string name:
+
+> #### Why the version in the name?
+>
+> A big benefit of the config system is that it makes your experiments
+> reproducible. We recommend versioning the functions you register, especially
+> if you expect them to change (like a new model architecture). This way, you
+> know that a config referencing `v1` means a different function than a config
+> referencing `v2`.
+
+```python
+### functions.py
+import spacy
+
+@spacy.registry.schedules("my_custom_schedule.v1")
+def my_custom_schedule(start: int = 1, factor: float = 1.001):
+    while True:
+        yield start
+        start = start * factor
+```
+
+In your config, you can now reference the schedule in the
+`[training.batch_size]` block via `@schedules`. If a block contains a key
+starting with an `@`, it's interpreted as a reference to a function. All other
+settings in the block will be passed to the function as keyword arguments. Keep
+in mind that the config shouldn't have any hidden defaults and all arguments on
+the functions need to be represented in the config.
+
+
+
+```ini
+### config.cfg (excerpt)
+[training.batch_size]
+@schedules = "my_custom_schedule.v1"
+start = 2
+factor = 1.005
+```
+
+You can now run [`spacy train`](/api/cli#train) with the `config.cfg` and your
+custom `functions.py` as the argument `--code`. Before loading the config, spaCy
+will import the `functions.py` module and your custom functions will be
+registered.
+
+```bash
+### Training with custom code {wrap="true"}
+python -m spacy train train.spacy dev.spacy config.cfg --output ./output --code ./functions.py
+```
+
+
+
+spaCy's configs are powered by our machine learning library Thinc's
+[configuration system](https://thinc.ai/docs/usage-config), which supports
+[type hints](https://docs.python.org/3/library/typing.html) and even
+[advanced type annotations](https://thinc.ai/docs/usage-config#advanced-types)
+using [`pydantic`](https://github.com/samuelcolvin/pydantic). If your registered
+function provides type hints, the values that are passed in will be validated
+against the expected types. For example, `start: int` in the example above will
+ensure that the value received as the argument `start` is an integer. If the
+value can't be cast to an integer, spaCy will raise an error.
+`start: pydantic.StrictInt` will force the value to be an integer and raise an
+error if it's not – for instance, if your config defines a float.
+
+
+
+### Defining custom architectures {#custom-architectures}
+
+
+
+### Wrapping PyTorch and TensorFlow {#custom-frameworks}
+
+
+
+
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
+sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
+mattis pretium.
+
+
+
 ## Parallel Training with Ray {#parallel-training} @@ -234,45 +375,93 @@ mattis pretium.
 
 ## Internal training API {#api}
 
-
+
 
-The [`GoldParse`](/api/goldparse) object collects the annotated training
-examples, also called the **gold standard**. It's initialized with the
-[`Doc`](/api/doc) object it refers to, and keyword arguments specifying the
-annotations, like `tags` or `entities`. Its job is to encode the annotations,
-keep them aligned and create the C-level data structures required for efficient
-access. Here's an example of a simple `GoldParse` for part-of-speech tags:
+spaCy gives you full control over the training loop. However, for most use
+cases, it's recommended to train your models via the
+[`spacy train`](/api/cli#train) command with a [`config.cfg`](#config) to keep
+track of your settings and hyperparameters, instead of writing your own training
+scripts from scratch.
+
+
+
+
+
+The [`Example`](/api/example) object contains annotated training data, also
+called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
+that will hold the predictions, and another `Doc` object that holds the
+gold-standard annotations.
Here's an example of a simple `Example` for
+part-of-speech tags:
 
 ```python
-vocab = Vocab(tag_map={"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}})
-doc = Doc(vocab, words=["I", "like", "stuff"])
-gold = GoldParse(doc, tags=["N", "V", "N"])
+words = ["I", "like", "stuff"]
+predicted = Doc(vocab, words=words)
+# create the reference Doc with gold-standard TAG annotations
+tags = ["NOUN", "VERB", "NOUN"]
+tag_ids = [vocab.strings.add(tag) for tag in tags]
+reference = Doc(vocab, words=words).from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
+example = Example(predicted, reference)
 ```
 
-Using the `Doc` and its gold-standard annotations, the model can be updated to
-learn a sentence of three words with their assigned part-of-speech tags. The
-[tag map](/usage/adding-languages#tag-map) is part of the vocabulary and defines
-the annotation scheme. If you're training a new language model, this will let
-you map the tags present in the treebank you train on to spaCy's tag scheme.
+Alternatively, the `reference` `Doc` with the gold-standard annotations can be
+created from a dictionary with keyword arguments specifying the annotations,
+like `tags` or `entities`. Using the `Example` object and its gold-standard
+annotations, the model can be updated to learn a sentence of three words with
+their assigned part-of-speech tags.
+
+> #### About the tag map
+>
+> The tag map is part of the vocabulary and defines the annotation scheme. If
+> you're training a new language model, this will let you map the tags present
+> in the treebank you train on to spaCy's tag scheme:
+>
+> ```python
+> tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}
+> vocab = Vocab(tag_map=tag_map)
+> ```
 
 ```python
-doc = Doc(Vocab(), words=["Facebook", "released", "React", "in", "2014"])
-gold = GoldParse(doc, entities=["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"])
+words = ["I", "like", "stuff"]
+tags = ["NOUN", "VERB", "NOUN"]
+predicted = Doc(nlp.vocab, words=words)
+example = Example.from_dict(predicted, {"tags": tags})
 ```
 
-The same goes for named entities. The letters added before the labels refer to
-the tags of the [BILUO scheme](/usage/linguistic-features#updating-biluo) – `O`
-is a token outside an entity, `U` an single entity unit, `B` the beginning of an
-entity, `I` a token inside an entity and `L` the last token of an entity.
+Here's another example that shows how to define gold-standard named entities.
+The letters added before the labels refer to the tags of the
+[BILUO scheme](/usage/linguistic-features#updating-biluo) – `O` is a token
+outside an entity, `U` a single entity unit, `B` the beginning of an entity,
+`I` a token inside an entity and `L` the last token of an entity.
+
+```python
+doc = Doc(nlp.vocab, words=["Facebook", "released", "React", "in", "2014"])
+example = Example.from_dict(doc, {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]})
+```
+
+
+
+As of v3.0, the [`Example`](/api/example) object replaces the `GoldParse` class.
+It can be constructed in a very similar way, from a `Doc` and a dictionary of
+annotations:
+
+```diff
+- gold = GoldParse(doc, entities=entities)
++ example = Example.from_dict(doc, {"entities": entities})
+```
+
+
 
 > - **Training data**: The training examples.
 > - **Text and label**: The current example.
 > - **Doc**: A `Doc` object created from the example text.
-> - **GoldParse**: A `GoldParse` object of the `Doc` and label.
+> - **Example**: An `Example` object holding both predictions and gold-standard
+>   annotations.
> - **nlp**: The `nlp` object with the model.
> - **Optimizer**: A function that holds state between updates.
> - **Update**: Update the model's weights.
 
+
+
 ![The training loop](../images/training-loop.svg)
 
 Of course, it's not enough to only show a model a single example once. @@ -286,34 +475,47 @@ dropout means that each feature or internal representation has a 1/4 likelihood
 of being dropped.
 
 > - [`begin_training`](/api/language#begin_training): Start the training and
->   return an optimizer function to update the model's weights. Can take an
->   optional function converting the training data to spaCy's training format.
-> - [`update`](/api/language#update): Update the model with the training example
->   and gold data.
+>   return an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object to
+>   update the model's weights.
+> - [`update`](/api/language#update): Update the model with the training
+>   examples.
 > - [`to_disk`](/api/language#to_disk): Save the updated model to a directory.
 
 ```python
 ### Example training loop
-optimizer = nlp.begin_training(get_data)
+optimizer = nlp.begin_training()
 for itn in range(100):
     random.shuffle(train_data)
     for raw_text, entity_offsets in train_data:
         doc = nlp.make_doc(raw_text)
-        gold = GoldParse(doc, entities=entity_offsets)
-        nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
+        example = Example.from_dict(doc, {"entities": entity_offsets})
+        nlp.update([example], sgd=optimizer)
 nlp.to_disk("/model")
 ```
 
 The [`nlp.update`](/api/language#update) method takes the following arguments:
 
-| Name    | Description |
-| ------- | ----------- |
-| `docs`  | [`Doc`](/api/doc) objects. The `update` method takes a sequence of them, so you can batch up your training examples. Alternatively, you can also pass in a sequence of raw texts. |
-| `golds` | [`GoldParse`](/api/goldparse) objects. The `update` method takes a sequence of them, so you can batch up your training examples. Alternatively, you can also pass in a dictionary containing the annotations. |
-| `drop`  | Dropout rate. Makes it harder for the model to just memorize the data. |
-| `sgd`   | An optimizer, i.e. a callable to update the model's weights. If not set, spaCy will create a new one and save it for further use. |
+| Name       | Description |
+| ---------- | ----------- |
+| `examples` | [`Example`](/api/example) objects. The `update` method takes a sequence of them, so you can batch up your training examples. |
+| `drop`     | Dropout rate. Makes it harder for the model to just memorize the data. |
+| `sgd`      | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object, which updates the model's weights. If not set, spaCy will create a new one and save it for further use. |
 
-Instead of writing your own training loop, you can also use the built-in
-[`train`](/api/cli#train) command, which expects data in spaCy's
-[JSON format](/api/data-formats#json-input). On each epoch, a model will be
-saved out to the directory.
+
+
+As of v3.0, the [`Example`](/api/example) object replaces the `GoldParse` class
+and the "simple training style" of calling `nlp.update` with a text and a
+dictionary of annotations.
Updating your code to use the `Example` object should
+be very straightforward: you can call
+[`Example.from_dict`](/api/example#from_dict) with a [`Doc`](/api/doc) and the
+dictionary of annotations:
+
+```diff
+text = "Facebook released React in 2014"
+annotations = {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]}
++ example = Example.from_dict(nlp.make_doc(text), annotations)
+- nlp.update([text], [annotations])
++ nlp.update([example])
+```
+
+
 
diff --git a/website/docs/usage/vectors-embeddings.md b/website/docs/usage/vectors-embeddings.md
index 49b651d9e..c3a73d4db 100644
--- a/website/docs/usage/vectors-embeddings.md
+++ b/website/docs/usage/vectors-embeddings.md
@@ -186,7 +186,7 @@ underlying [`Lexeme`](/api/lexeme), while [`Doc.vector`](/api/doc#vector) and
 tokens. You can customize these behaviors by modifying the `doc.user_hooks`,
 `doc.user_span_hooks` and `doc.user_token_hooks` dictionaries.
 
-
+
 
 For more details on **adding hooks** and **overwriting** the built-in `Doc`,
 `Span` and `Token` methods, see the usage guide on
diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md
index 6b533b739..5db741d52 100644
--- a/website/docs/usage/visualizers.md
+++ b/website/docs/usage/visualizers.md
@@ -4,7 +4,7 @@ teaser: Visualize dependencies and entities in your browser or in a notebook
 new: 2
 menu:
   - ['Dependencies', 'dep']
-  - ['Entities', 'ent']
+  - ['Named Entities', 'ent']
   - ['Jupyter Notebooks', 'jupyter']
   - ['Rendering HTML', 'html']
   - ['Web app usage', 'webapp']
@@ -356,6 +356,6 @@ Alternatively, if you're using [Streamlit](https://streamlit.io), check out the
 helps you integrate spaCy visualizations into your apps. It includes a full
 embedded visualizer, as well as individual components.
 
-![](../images/spacy-streamlit.png)]
+![](../images/spacy-streamlit.png)
 
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 18b14751e..3fed561d0 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -79,7 +79,9 @@
   "items": [
     { "text": "Language", "url": "/api/language" },
     { "text": "Tokenizer", "url": "/api/tokenizer" },
+    { "text": "Tok2Vec", "url": "/api/tok2vec" },
     { "text": "Lemmatizer", "url": "/api/lemmatizer" },
+    { "text": "Morphologizer", "url": "/api/morphologizer" },
     { "text": "Tagger", "url": "/api/tagger" },
     { "text": "DependencyParser", "url": "/api/dependencyparser" },
     { "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
diff --git a/website/src/components/card.js b/website/src/components/card.js
index ca4619b06..fee381c5e 100644
--- a/website/src/components/card.js
+++ b/website/src/components/card.js
@@ -1,29 +1,32 @@
 import React from 'react'
 import PropTypes from 'prop-types'
+import classNames from 'classnames'
 import Link from './link'
 import { H5 } from './typography'
 import classes from '../styles/card.module.sass'
 
-const Card = ({ title, to, image, header, onClick, children }) => (
-
+const Card = ({ title, to, image, header, small, onClick, children }) => ( +
{header && ( {header} )} -
- {image && ( -
- -
- )} - {title && ( - - {title} - - )} -
+ {(title || image) && ( +
+ {image && ( +
+ +
+ )} + {title && ( + + {title} + + )} +
+ )} {children} @@ -31,10 +34,10 @@ const Card = ({ title, to, image, header, onClick, children }) => ( ) Card.propTypes = { - title: PropTypes.string, + title: PropTypes.node, + header: PropTypes.node, to: PropTypes.string, image: PropTypes.string, - card: PropTypes.node, onClick: PropTypes.func, children: PropTypes.node, } diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js index 6af24a6ca..06c5fbb95 100644 --- a/website/src/components/infobox.js +++ b/website/src/components/infobox.js @@ -5,7 +5,7 @@ import classNames from 'classnames' import Icon from './icon' import classes from '../styles/infobox.module.sass' -const Infobox = ({ title, id, variant, className, children }) => { +const Infobox = ({ title, emoji, id, variant, className, children }) => { const infoboxClassNames = classNames(classes.root, className, { [classes.warning]: variant === 'warning', [classes.danger]: variant === 'danger', @@ -17,7 +17,14 @@ const Infobox = ({ title, id, variant, className, children }) => { {variant !== 'default' && ( )} - {title} + + {emoji && ( + + )} + {title} + )} {children} @@ -30,7 +37,7 @@ Infobox.defaultProps = { } Infobox.propTypes = { - title: PropTypes.string, + title: PropTypes.node, id: PropTypes.string, variant: PropTypes.oneOf(['default', 'warning', 'danger']), className: PropTypes.string, diff --git a/website/src/components/table.js b/website/src/components/table.js index 85b8e2144..1a7d460d0 100644 --- a/website/src/components/table.js +++ b/website/src/components/table.js @@ -26,6 +26,16 @@ function getCellContent(children) { return children } +function isDividerRow(children) { + if (children.length && children[0].props && children[0].props.name == 'td') { + const tdChildren = children[0].props.children + if (tdChildren && !Array.isArray(tdChildren) && tdChildren.props) { + return tdChildren.props.name === 'em' + } + } + return false +} + function isFootRow(children) { const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS)/ if (children.length && children[0].props.name === 'td') { @@ -53,9 +63,11 @@ export const Th = props => export const Tr = ({ evenodd = true, children, ...props }) => { const foot = isFootRow(children) + const isDivider = isDividerRow(children) const trClasssNames = classNames({ [classes.tr]: evenodd, [classes.footer]: foot, + [classes.divider]: isDivider, 'table-footer': foot, }) diff --git a/website/src/images/logos/dvc.svg b/website/src/images/logos/dvc.svg new file mode 100644 index 000000000..258ab1374 --- /dev/null +++ b/website/src/images/logos/dvc.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/website/src/images/logos/fastapi.svg b/website/src/images/logos/fastapi.svg new file mode 100644 index 000000000..bdd514a4b --- /dev/null +++ b/website/src/images/logos/fastapi.svg @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/website/src/images/logos/prodigy.svg b/website/src/images/logos/prodigy.svg new file mode 100644 index 000000000..3f318b793 --- /dev/null +++ b/website/src/images/logos/prodigy.svg @@ -0,0 +1,3 @@ + + + diff --git a/website/src/images/logos/ray.svg b/website/src/images/logos/ray.svg new file mode 100644 index 000000000..3e7390dce --- /dev/null +++ b/website/src/images/logos/ray.svg @@ -0,0 +1,4 @@ + + + + diff --git a/website/src/images/logos/streamlit.svg b/website/src/images/logos/streamlit.svg new file mode 100644 index 000000000..3c55deb55 --- /dev/null +++ b/website/src/images/logos/streamlit.svg @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/website/src/images/logos/wandb.svg 
b/website/src/images/logos/wandb.svg new file mode 100644 index 000000000..e3f8ea7fa --- /dev/null +++ b/website/src/images/logos/wandb.svg @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/src/styles/card.module.sass b/website/src/styles/card.module.sass index d9e0633cf..629607bd5 100644 --- a/website/src/styles/card.module.sass +++ b/website/src/styles/card.module.sass @@ -5,6 +5,15 @@ font: var(--font-size-md)/var(--line-height-md) var(--font-primary) margin-bottom: var(--spacing-sm) +.small + padding: 1.5rem + font-size: var(--font-size-sm) + line-height: var(--line-height-sm) + color: var(--color-dark) + + .title + margin-bottom: var(--spacing-xs) + .image $image-size: 35px width: $image-size diff --git a/website/src/styles/infobox.module.sass b/website/src/styles/infobox.module.sass index 2be59f33b..baf9919c3 100644 --- a/website/src/styles/infobox.module.sass +++ b/website/src/styles/infobox.module.sass @@ -31,6 +31,9 @@ position: relative bottom: -2px +.emoji + margin-right: 0.65em + .warning --color-theme: var(--color-yellow-dark) --color-theme-dark: var(--color-yellow-dark) diff --git a/website/src/styles/layout.sass b/website/src/styles/layout.sass index 56f1a5aa6..4b63324b9 100644 --- a/website/src/styles/layout.sass +++ b/website/src/styles/layout.sass @@ -25,7 +25,7 @@ --line-height-sm: 1.375 --line-height-md: 1.5 --line-height-lg: 1.9 - --line-height-code: 1.8 + --line-height-code: 1.7 // Spacing --spacing-xs: 1rem @@ -271,7 +271,7 @@ body color: var(--color-front) p - margin-bottom: var(--spacing-md) + margin-bottom: var(--spacing-sm) font-family: var(--font-primary) font-size: var(--font-size-md) line-height: var(--line-height-md) diff --git a/website/src/styles/table.module.sass b/website/src/styles/table.module.sass index 68cc4bace..7a82a26fe 100644 --- a/website/src/styles/table.module.sass +++ b/website/src/styles/table.module.sass @@ -49,6 +49,36 @@ border-bottom: 2px solid var(--color-theme) vertical-align: bottom +.divider + height: 0 + border-bottom: 1px solid var(--color-subtle) + + td + top: -1px + height: 0 + position: relative + padding: 0 !important + + & + tr td + padding-top: 12px + + td em + position: absolute + top: -5px + left: 10px + display: inline-block + background: var(--color-theme) + color: var(--color-back) + padding: 0 5px 1px + font-size: 0.85rem + text-transform: uppercase + font-weight: bold + border: 0 + border-radius: 1em + font-style: normal + white-space: nowrap + z-index: 5 + // Responsive table // Shadows adapted from "CSS only Responsive Tables" by David Bushell // http://codepen.io/dbushell/pen/wGaamR diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 7f9314d9d..c97663317 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -33,6 +33,7 @@ import { YouTube, SoundCloud, Iframe, Image } from '../components/embed' import Alert from '../components/alert' import Search from '../components/search' import Project from '../widgets/project' +import { Integration, IntegrationLogo } from '../widgets/integration' const mdxComponents = { a: Link, @@ -75,6 +76,8 @@ const scopeComponents = { Grid, InlineCode, Project, + Integration, + IntegrationLogo, } const AlertSpace = ({ nightly }) => { diff --git a/website/src/widgets/integration.js b/website/src/widgets/integration.js new file mode 100644 index 000000000..50a84f26c --- /dev/null +++ b/website/src/widgets/integration.js @@ -0,0 +1,46 @@ +import React from 'react' + +import Card from 
'../components/card' + +import { ReactComponent as DVCLogo } from '../images/logos/dvc.svg' +import { ReactComponent as ProdigyLogo } from '../images/logos/prodigy.svg' +import { ReactComponent as StreamlitLogo } from '../images/logos/streamlit.svg' +import { ReactComponent as FastAPILogo } from '../images/logos/fastapi.svg' +import { ReactComponent as WandBLogo } from '../images/logos/wandb.svg' +import { ReactComponent as RayLogo } from '../images/logos/ray.svg' + +const LOGOS = { + dvc: DVCLogo, + prodigy: ProdigyLogo, + streamlit: StreamlitLogo, + fastapi: FastAPILogo, + wandb: WandBLogo, + ray: RayLogo, +} + +export const IntegrationLogo = ({ name, title, width, height, maxWidth, align, ...props }) => { + const Logo = LOGOS[name] + if (!Logo) throw new Error(`Unknown logo: ${name}`) + const style = { maxWidth, float: align || 'none' } + return ( + + ) +} + +export const Integration = ({ height = 30, url, logo, title, children }) => { + const header = logo && ( + + ) + return ( + + {children} + + ) +} diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js index f1c18cf7a..d46472706 100644 --- a/website/src/widgets/project.js +++ b/website/src/widgets/project.js @@ -15,14 +15,14 @@ const Project = ({ id, repo, children }) => { const url = `${repo || DEFAULT_REPO}/${id}` const title = ( <> - 🪐 Get started with a project template:{' '} + Get started with a project template:{' '} {id} ) return ( - + {children}