mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	Merge remote-tracking branch 'origin/develop' into rliaw-develop
This commit is contained in:
		
						commit
						3bccf8b954
					
				|  | @ -5,16 +5,16 @@ | |||
| # data is passed in sentence-by-sentence via some prior preprocessing. | ||||
| gold_preproc = false | ||||
| # Limitations on training document length or number of examples. | ||||
| max_length = 5000 | ||||
| max_length = 3000 | ||||
| limit = 0 | ||||
| # Data augmentation | ||||
| orth_variant_level = 0.0 | ||||
| dropout = 0.2 | ||||
| dropout = 0.1 | ||||
| # Controls early-stopping. 0 or -1 mean unlimited. | ||||
| patience = 1600 | ||||
| patience = 100000 | ||||
| max_epochs = 0 | ||||
| max_steps = 20000 | ||||
| eval_frequency = 500 | ||||
| max_steps = 0 | ||||
| eval_frequency = 1000 | ||||
| # Other settings | ||||
| seed = 0 | ||||
| accumulate_gradient = 1 | ||||
|  | @ -26,6 +26,7 @@ score_weights = {"ents_f": 1.0} | |||
| init_tok2vec = null | ||||
| discard_oversize = false | ||||
| omit_extra_lookups = false | ||||
| batch_by = "words" | ||||
| 
 | ||||
| [training.batch_size] | ||||
| @schedules = "compounding.v1" | ||||
|  | @ -37,19 +38,13 @@ compound = 1.001 | |||
| @optimizers = "Adam.v1" | ||||
| beta1 = 0.9 | ||||
| beta2 = 0.999 | ||||
| L2_is_weight_decay = false | ||||
| L2 = 1e-6 | ||||
| L2_is_weight_decay = true | ||||
| L2 = 0.01 | ||||
| grad_clip = 1.0 | ||||
| use_averages = true | ||||
| eps = 1e-8 | ||||
| learn_rate = 0.001 | ||||
| 
 | ||||
| #[optimizer.learn_rate] | ||||
| #@schedules = "warmup_linear.v1" | ||||
| #warmup_steps = 250 | ||||
| #total_steps = 20000 | ||||
| #initial_rate = 0.001 | ||||
| 
 | ||||
| [nlp] | ||||
| lang = "en" | ||||
| vectors = null | ||||
|  | @ -58,8 +53,6 @@ vectors = null | |||
| factory = "ner" | ||||
| learn_tokens = false | ||||
| min_action_freq = 1 | ||||
| beam_width = 1 | ||||
| beam_update_prob = 1.0 | ||||
| 
 | ||||
| [nlp.pipeline.ner.model] | ||||
| @architectures = "spacy.TransitionBasedParser.v1" | ||||
|  |  | |||
|  | @ -1,8 +1,7 @@ | |||
| # fmt: off | ||||
| __title__ = "spacy-nightly" | ||||
| __version__ = "3.0.0a2" | ||||
| __version__ = "3.0.0a4" | ||||
| __release__ = True | ||||
| __download_url__ = "https://github.com/explosion/spacy-models/releases/download" | ||||
| __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" | ||||
| __shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json" | ||||
| __projects__ = "https://github.com/explosion/spacy-boilerplates" | ||||
|  |  | |||
|  | @ -11,12 +11,15 @@ from .profile import profile  # noqa: F401 | |||
| from .train import train_cli  # noqa: F401 | ||||
| from .pretrain import pretrain  # noqa: F401 | ||||
| from .debug_data import debug_data  # noqa: F401 | ||||
| from .debug_model import debug_model  # noqa: F401 | ||||
| from .evaluate import evaluate  # noqa: F401 | ||||
| from .convert import convert  # noqa: F401 | ||||
| from .init_model import init_model  # noqa: F401 | ||||
| from .validate import validate  # noqa: F401 | ||||
| from .project import project_clone, project_assets, project_run  # noqa: F401 | ||||
| from .project import project_run_all  # noqa: F401 | ||||
| from .project.clone import project_clone  # noqa: F401 | ||||
| from .project.assets import project_assets  # noqa: F401 | ||||
| from .project.run import project_run  # noqa: F401 | ||||
| from .project.dvc import project_update_dvc  # noqa: F401 | ||||
| 
 | ||||
| 
 | ||||
| @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) | ||||
|  |  | |||
|  | @ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface | |||
| 
 | ||||
| DOCS: https://spacy.io/api/cli | ||||
| """ | ||||
| PROJECT_HELP = f"""Command-line interface for spaCy projects and working with | ||||
| project templates. You'd typically start by cloning a project template to a local | ||||
| directory and fetching its assets like datasets etc. See the project's | ||||
| project.yml for the available commands. | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| app = typer.Typer(name=NAME, help=HELP) | ||||
| project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True) | ||||
| app.add_typer(project_cli) | ||||
| 
 | ||||
| # Wrappers for Typer's annotations. Initially created to set defaults and to | ||||
| # keep the names short, but not needed at the moment. | ||||
|  |  | |||
|  | @ -120,8 +120,12 @@ def convert( | |||
|             no_print=silent, | ||||
|             ner_map=ner_map, | ||||
|         ) | ||||
|         if file_type == "json": | ||||
|             data = [docs_to_json(docs)] | ||||
|         else: | ||||
|             data = DocBin(docs=docs, store_user_data=True).to_bytes() | ||||
|         if output_dir == "-": | ||||
|             _print_docs_to_stdout(docs, file_type) | ||||
|             _print_docs_to_stdout(data, file_type) | ||||
|         else: | ||||
|             if input_loc != input_path: | ||||
|                 subpath = input_loc.relative_to(input_path) | ||||
|  | @ -129,24 +133,23 @@ def convert( | |||
|             else: | ||||
|                 output_file = Path(output_dir) / input_loc.parts[-1] | ||||
|                 output_file = output_file.with_suffix(f".{file_type}") | ||||
|             _write_docs_to_file(docs, output_file, file_type) | ||||
|             _write_docs_to_file(data, output_file, file_type) | ||||
|             msg.good(f"Generated output file ({len(docs)} documents): {output_file}") | ||||
| 
 | ||||
| 
 | ||||
| def _print_docs_to_stdout(docs, output_type): | ||||
| def _print_docs_to_stdout(data, output_type): | ||||
|     if output_type == "json": | ||||
|         srsly.write_json("-", [docs_to_json(docs)]) | ||||
|         srsly.write_json("-", data) | ||||
|     else: | ||||
|         sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes()) | ||||
|         sys.stdout.buffer.write(data) | ||||
| 
 | ||||
| 
 | ||||
| def _write_docs_to_file(docs, output_file, output_type): | ||||
| def _write_docs_to_file(data, output_file, output_type): | ||||
|     if not output_file.parent.exists(): | ||||
|         output_file.parent.mkdir(parents=True) | ||||
|     if output_type == "json": | ||||
|         srsly.write_json(output_file, [docs_to_json(docs)]) | ||||
|         srsly.write_json(output_file, data) | ||||
|     else: | ||||
|         data = DocBin(docs=docs, store_user_data=True).to_bytes() | ||||
|         with output_file.open("wb") as file_: | ||||
|             file_.write(data) | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										168
									
								
								spacy/cli/debug_model.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								spacy/cli/debug_model.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,168 @@ | |||
| from typing import List | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._app import app, Arg, Opt | ||||
| from .. import util | ||||
| from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam | ||||
| from ..lang.en import English | ||||
| 
 | ||||
| 
 | ||||
| @app.command("debug-model") | ||||
| def debug_model_cli( | ||||
|     # fmt: off | ||||
|     config_path: Path = Arg(..., help="Path to config file", exists=True), | ||||
|     layers: str = Opt("", "--layers", "-l", help="Comma-separated indices of the model layers to print"), | ||||
|     dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"), | ||||
|     parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"), | ||||
|     gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"), | ||||
|     attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"), | ||||
|     P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"), | ||||
|     P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"), | ||||
|     P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"), | ||||
|     P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"), | ||||
|     use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), | ||||
|     seed: int = Opt(None, "--seed", "-s", help="Seed for the random number generators"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """ | ||||
|     Analyze a Thinc ML model - internal structure and activations during training | ||||
|     """ | ||||
|     print_settings = { | ||||
|         "dimensions": dimensions, | ||||
|         "parameters": parameters, | ||||
|         "gradients": gradients, | ||||
|         "attributes": attributes, | ||||
|         "layers": [int(x.strip()) for x in layers.split(",")] if layers else [], | ||||
|         "print_before_training": P0, | ||||
|         "print_after_init": P1, | ||||
|         "print_after_training": P2, | ||||
|         "print_prediction": P3, | ||||
|     } | ||||
| 
 | ||||
|     if seed is not None: | ||||
|         msg.info(f"Fixing random seed: {seed}") | ||||
|         fix_random_seed(seed) | ||||
|     if use_gpu >= 0: | ||||
|         msg.info(f"Using GPU: {use_gpu}") | ||||
|         require_gpu(use_gpu) | ||||
|     else: | ||||
|         msg.info(f"Using CPU") | ||||
| 
 | ||||
|     debug_model( | ||||
|         config_path, | ||||
|         print_settings=print_settings, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def debug_model( | ||||
|     config_path: Path, | ||||
|     *, | ||||
|     print_settings=None | ||||
| ): | ||||
|     if print_settings is None: | ||||
|         print_settings = {} | ||||
| 
 | ||||
|     model = util.load_config(config_path, create_objects=True)["model"] | ||||
| 
 | ||||
|     # STEP 0: Printing before training | ||||
|     msg.info(f"Analysing model with ID {model.id}") | ||||
|     if print_settings.get("print_before_training"): | ||||
|         msg.info(f"Before training:") | ||||
|         _print_model(model, print_settings) | ||||
| 
 | ||||
|     # STEP 1: Initializing the model and printing again | ||||
|     model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp)) | ||||
|     if print_settings.get("print_after_init"): | ||||
|         msg.info(f"After initialization:") | ||||
|         _print_model(model, print_settings) | ||||
| 
 | ||||
|     # STEP 2: Updating the model and printing again | ||||
|     optimizer = Adam(0.001) | ||||
|     set_dropout_rate(model, 0.2) | ||||
|     for e in range(3): | ||||
|         Y, get_dX = model.begin_update(_get_docs()) | ||||
|         dY = get_gradient(model, Y) | ||||
|         _ = get_dX(dY) | ||||
|         model.finish_update(optimizer) | ||||
|     if print_settings.get("print_after_training"): | ||||
|         msg.info(f"After training:") | ||||
|         _print_model(model, print_settings) | ||||
| 
 | ||||
|     # STEP 3: the final prediction | ||||
|     prediction = model.predict(_get_docs()) | ||||
|     if print_settings.get("print_prediction"): | ||||
|         msg.info(f"Prediction:", str(prediction)) | ||||
| 
 | ||||
| 
 | ||||
| def get_gradient(model, Y): | ||||
|     goldY = _get_output(model.ops.xp) | ||||
|     return Y - goldY | ||||
| 
 | ||||
| 
 | ||||
| def _sentences(): | ||||
|     return [ | ||||
|         "Apple is looking at buying U.K. startup for $1 billion", | ||||
|         "Autonomous cars shift insurance liability toward manufacturers", | ||||
|         "San Francisco considers banning sidewalk delivery robots", | ||||
|         "London is a big city in the United Kingdom.", | ||||
|     ] | ||||
| 
 | ||||
| 
 | ||||
| def _get_docs(): | ||||
|     nlp = English() | ||||
|     return list(nlp.pipe(_sentences())) | ||||
| 
 | ||||
| 
 | ||||
| def _get_output(xp): | ||||
|     return xp.asarray([xp.asarray([i+10, i+20, i+30], dtype="float32") for i, _ in enumerate(_get_docs())]) | ||||
| 
 | ||||
| 
 | ||||
| def _print_model(model, print_settings): | ||||
|     layers = print_settings.get("layers", "") | ||||
|     parameters = print_settings.get("parameters", False) | ||||
|     dimensions = print_settings.get("dimensions", False) | ||||
|     gradients = print_settings.get("gradients", False) | ||||
|     attributes = print_settings.get("attributes", False) | ||||
| 
 | ||||
|     for i, node in enumerate(model.walk()): | ||||
|         if not layers or i in layers: | ||||
|             msg.info(f"Layer {i}: model ID {node.id}: '{node.name}'") | ||||
| 
 | ||||
|             if dimensions: | ||||
|                 for name in node.dim_names: | ||||
|                     if node.has_dim(name): | ||||
|                         msg.info(f" - dim {name}: {node.get_dim(name)}") | ||||
|                     else: | ||||
|                         msg.info(f" - dim {name}: {node.has_dim(name)}") | ||||
| 
 | ||||
|             if parameters: | ||||
|                 for name in node.param_names: | ||||
|                     if node.has_param(name): | ||||
|                         print_value = _print_matrix(node.get_param(name)) | ||||
|                         msg.info(f" - param {name}: {print_value}") | ||||
|                     else: | ||||
|                         msg.info(f" - param {name}: {node.has_param(name)}") | ||||
|             if gradients: | ||||
|                 for name in node.param_names: | ||||
|                     if node.has_grad(name): | ||||
|                         print_value = _print_matrix(node.get_grad(name)) | ||||
|                         msg.info(f" - grad {name}: {print_value}") | ||||
|                     else: | ||||
|                         msg.info(f" - grad {name}: {node.has_grad(name)}") | ||||
|             if attributes: | ||||
|                 attrs = node.attrs | ||||
|                 for name, value in attrs.items(): | ||||
|                     msg.info(f" - attr {name}: {value}") | ||||
| 
 | ||||
| 
 | ||||
| def _print_matrix(value): | ||||
|     if value is None or isinstance(value, bool): | ||||
|         return value | ||||
|     result = str(value.shape) + " - sample: " | ||||
|     sample_matrix = value | ||||
|     for d in range(value.ndim-1): | ||||
|         sample_matrix = sample_matrix[0] | ||||
|     sample_matrix = sample_matrix[0:5] | ||||
|     result = result + str(sample_matrix) | ||||
|     return result | ||||
|  | @ -1,4 +1,4 @@ | |||
| from typing import Optional, Sequence, Union | ||||
| from typing import Optional, Sequence | ||||
| import requests | ||||
| import sys | ||||
| from wasabi import msg | ||||
|  | @ -8,6 +8,23 @@ from ._app import app, Arg, Opt | |||
| from .. import about | ||||
| from ..util import is_package, get_base_version, run_command | ||||
| 
 | ||||
| # These are the old shortcuts we previously supported in spacy download. As of | ||||
| # v3, shortcuts are deprecated so we're not expecting to add anything to this | ||||
| # list. It only exists to show users warnings. | ||||
| OLD_SHORTCUTS = { | ||||
|     "en": "en_core_web_sm", | ||||
|     "de": "de_core_news_sm", | ||||
|     "es": "es_core_news_sm", | ||||
|     "pt": "pt_core_news_sm", | ||||
|     "fr": "fr_core_news_sm", | ||||
|     "it": "it_core_news_sm", | ||||
|     "nl": "nl_core_news_sm", | ||||
|     "el": "el_core_news_sm", | ||||
|     "nb": "nb_core_news_sm", | ||||
|     "lt": "lt_core_news_sm", | ||||
|     "xx": "xx_ent_wiki_sm", | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
|     "download", | ||||
|  | @ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None: | |||
|         version = components[-1] | ||||
|         download_model(dl_tpl.format(m=model_name, v=version), pip_args) | ||||
|     else: | ||||
|         shortcuts = get_json(about.__shortcuts__, "available shortcuts") | ||||
|         model_name = shortcuts.get(model, model) | ||||
|         model_name = model | ||||
|         if model in OLD_SHORTCUTS: | ||||
|             msg.warn( | ||||
|                 f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. " | ||||
|                 f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead." | ||||
|             ) | ||||
|             model_name = OLD_SHORTCUTS[model] | ||||
|         compatibility = get_compatibility() | ||||
|         version = get_version(model_name, compatibility) | ||||
|         download_model(dl_tpl.format(m=model_name, v=version), pip_args) | ||||
|  | @ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None: | |||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def get_json(url: str, desc: str) -> Union[dict, list]: | ||||
|     r = requests.get(url) | ||||
| def get_compatibility() -> dict: | ||||
|     version = get_base_version(about.__version__) | ||||
|     r = requests.get(about.__compatibility__) | ||||
|     if r.status_code != 200: | ||||
|         msg.fail( | ||||
|             f"Server error ({r.status_code})", | ||||
|             f"Couldn't fetch {desc}. Please find a model for your spaCy " | ||||
|             f"Couldn't fetch compatibility table. Please find a model for your spaCy " | ||||
|             f"installation (v{about.__version__}), and download it manually. " | ||||
|             f"For more details, see the documentation: " | ||||
|             f"https://spacy.io/usage/models", | ||||
|             exits=1, | ||||
|         ) | ||||
|     return r.json() | ||||
| 
 | ||||
| 
 | ||||
| def get_compatibility() -> dict: | ||||
|     version = get_base_version(about.__version__) | ||||
|     comp_table = get_json(about.__compatibility__, "compatibility table") | ||||
|     comp_table = r.json() | ||||
|     comp = comp_table["spacy"] | ||||
|     if version not in comp: | ||||
|         msg.fail(f"No compatible models found for v{version} of spaCy", exits=1) | ||||
|  |  | |||
|  | @ -1,708 +0,0 @@ | |||
| from typing import List, Dict, Any, Optional, Sequence | ||||
| import typer | ||||
| import srsly | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import subprocess | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| import sys | ||||
| import requests | ||||
| import tqdm | ||||
| 
 | ||||
| from ._app import app, Arg, Opt, COMMAND, NAME | ||||
| from .. import about | ||||
| from ..schemas import ProjectConfigSchema, validate | ||||
| from ..util import ensure_path, run_command, make_tempdir, working_dir | ||||
| from ..util import get_hash, get_checksum, split_command | ||||
| 
 | ||||
| 
 | ||||
| CONFIG_FILE = "project.yml" | ||||
| DVC_CONFIG = "dvc.yaml" | ||||
| DVC_DIR = ".dvc" | ||||
| DIRS = [ | ||||
|     "assets", | ||||
|     "metas", | ||||
|     "configs", | ||||
|     "packages", | ||||
|     "metrics", | ||||
|     "scripts", | ||||
|     "notebooks", | ||||
|     "training", | ||||
|     "corpus", | ||||
| ] | ||||
| CACHES = [ | ||||
|     Path.home() / ".torch", | ||||
|     Path.home() / ".caches" / "torch", | ||||
|     os.environ.get("TORCH_HOME"), | ||||
|     Path.home() / ".keras", | ||||
| ] | ||||
| DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit | ||||
| # it directly and edit the project.yml instead and re-run the project.""" | ||||
| CLI_HELP = f"""Command-line interface for spaCy projects and working with project | ||||
| templates. You'd typically start by cloning a project template to a local | ||||
| directory and fetching its assets like datasets etc. See the project's | ||||
| {CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data | ||||
| Version Control) to manage input and output files and to ensure steps are only | ||||
| re-run if their inputs change. | ||||
| """ | ||||
| 
 | ||||
| project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.callback(invoke_without_command=True) | ||||
| def callback(ctx: typer.Context): | ||||
|     """This runs before every project command and ensures DVC is installed.""" | ||||
|     ensure_dvc() | ||||
| 
 | ||||
| 
 | ||||
| ################ | ||||
| # CLI COMMANDS # | ||||
| ################ | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("clone") | ||||
| def project_clone_cli( | ||||
|     # fmt: off | ||||
|     name: str = Arg(..., help="The name of the template to fetch"), | ||||
|     dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), | ||||
|     repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), | ||||
|     git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), | ||||
|     no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Clone a project template from a repository. Calls into "git" and will | ||||
|     only download the files from the given subdirectory. The GitHub repo | ||||
|     defaults to the official spaCy template repo, but can be customized | ||||
|     (including using a private repo). Setting the --git flag will also | ||||
|     initialize the project directory as a Git repo. If the project is intended | ||||
|     to be a Git repo, it should be initialized with Git first, before | ||||
|     initializing DVC (Data Version Control). This allows DVC to integrate with | ||||
|     Git. | ||||
|     """ | ||||
|     if dest == Path.cwd(): | ||||
|         dest = dest / name | ||||
|     project_clone(name, dest, repo=repo, git=git, no_init=no_init) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("init") | ||||
| def project_init_cli( | ||||
|     # fmt: off | ||||
|     path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), | ||||
|     force: bool = Opt(False, "--force", "-F", help="Force initialization"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Initialize a project directory with DVC and optionally Git. This should | ||||
|     typically be taken care of automatically when you run the "project clone" | ||||
|     command, but you can also run it separately. If the project is intended to | ||||
|     be a Git repo, it should be initialized with Git first, before initializing | ||||
|     DVC. This allows DVC to integrate with Git. | ||||
|     """ | ||||
|     project_init(path, git=git, force=force, silent=True) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("assets") | ||||
| def project_assets_cli( | ||||
|     # fmt: off | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Use DVC (Data Version Control) to fetch project assets. Assets are | ||||
|     defined in the "assets" section of the project config. If possible, DVC | ||||
|     will try to track the files so you can pull changes from upstream. It will | ||||
|     also try and store the checksum so the assets are versioned. If the file | ||||
|     can't be tracked or checked, it will be downloaded without DVC. If a checksum | ||||
|     is provided in the project config, the file is only downloaded if no local | ||||
|     file with the same checksum exists. | ||||
|     """ | ||||
|     project_assets(project_dir) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command( | ||||
|     "run-all", | ||||
|     context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, | ||||
| ) | ||||
| def project_run_all_cli( | ||||
|     # fmt: off | ||||
|     ctx: typer.Context, | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Run all commands defined in the project. This command will use DVC and | ||||
|     the defined outputs and dependencies in the project config to determine | ||||
|     which steps need to be re-run and where to start. This means you're only | ||||
|     re-generating data if the inputs have changed. | ||||
| 
 | ||||
|     This command calls into "dvc repro" and all additional arguments are passed | ||||
|     to the "dvc repro" command: https://dvc.org/doc/command-reference/repro | ||||
|     """ | ||||
|     if show_help: | ||||
|         print_run_help(project_dir) | ||||
|     else: | ||||
|         project_run_all(project_dir, *ctx.args) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command( | ||||
|     "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, | ||||
| ) | ||||
| def project_run_cli( | ||||
|     # fmt: off | ||||
|     ctx: typer.Context, | ||||
|     subcommand: str = Arg(None, help="Name of command defined in project config"), | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Run a named script defined in the project config. If the command is | ||||
|     part of the default pipeline defined in the "run" section, DVC is used to | ||||
|     determine whether the step should re-run if its inputs have changed, or | ||||
|     whether everything is up to date. If the script is not part of the default | ||||
|     pipeline, it will be called separately without DVC. | ||||
| 
 | ||||
|     If DVC is used, the command calls into "dvc repro" and all additional | ||||
|     arguments are passed to the "dvc repro" command: | ||||
|     https://dvc.org/doc/command-reference/repro | ||||
|     """ | ||||
|     if show_help or not subcommand: | ||||
|         print_run_help(project_dir, subcommand) | ||||
|     else: | ||||
|         project_run(project_dir, subcommand, *ctx.args) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("exec", hidden=True) | ||||
| def project_exec_cli( | ||||
|     # fmt: off | ||||
|     subcommand: str = Arg(..., help="Name of command defined in project config"), | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Execute a command defined in the project config. This CLI command is | ||||
|     only called internally in auto-generated DVC pipelines, as a shortcut for | ||||
|     multi-step commands in the project config. You typically shouldn't have to | ||||
|     call it yourself. To run a command, call "run" or "run-all". | ||||
|     """ | ||||
|     project_exec(project_dir, subcommand) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("update-dvc") | ||||
| def project_update_dvc_cli( | ||||
|     # fmt: off | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), | ||||
|     force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Update the auto-generated DVC config file. Uses the steps defined in the | ||||
|     "run" section of the project config. This typically happens automatically | ||||
|     when running a command, but can also be triggered manually if needed. | ||||
|     """ | ||||
|     config = load_project_config(project_dir) | ||||
|     updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) | ||||
|     if updated: | ||||
|         msg.good(f"Updated DVC config from {CONFIG_FILE}") | ||||
|     else: | ||||
|         msg.info(f"No changes found in {CONFIG_FILE}, no update needed") | ||||
| 
 | ||||
| 
 | ||||
| app.add_typer(project_cli, name="project") | ||||
| 
 | ||||
| 
 | ||||
| ################# | ||||
| # CLI FUNCTIONS # | ||||
| ################# | ||||
| 
 | ||||
| 
 | ||||
| def project_clone( | ||||
|     name: str, | ||||
|     dest: Path, | ||||
|     *, | ||||
|     repo: str = about.__projects__, | ||||
|     git: bool = False, | ||||
|     no_init: bool = False, | ||||
| ) -> None: | ||||
|     """Clone a project template from a repository. | ||||
| 
 | ||||
|     name (str): Name of subdirectory to clone. | ||||
|     dest (Path): Destination path of cloned project. | ||||
|     repo (str): URL of Git repo containing project templates. | ||||
|     git (bool): Initialize project as Git repo. Should be set to True if project | ||||
|         is intended as a repo, since it will allow DVC to integrate with Git. | ||||
|     no_init (bool): Don't initialize DVC and Git automatically. If True, the | ||||
|         "init" command or "git init" and "dvc init" need to be run manually. | ||||
|     """ | ||||
|     dest = ensure_path(dest) | ||||
|     check_clone(name, dest, repo) | ||||
|     project_dir = dest.resolve() | ||||
|     # We're using Git and sparse checkout to only clone the files we need | ||||
|     with make_tempdir() as tmp_dir: | ||||
|         cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" | ||||
|         try: | ||||
|             run_command(cmd) | ||||
|         except SystemExit: | ||||
|             err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." | ||||
|             msg.fail(err) | ||||
|         with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: | ||||
|             f.write(name) | ||||
|         try: | ||||
|             run_command(["git", "-C", str(tmp_dir), "fetch"]) | ||||
|             run_command(["git", "-C", str(tmp_dir), "checkout"]) | ||||
|         except SystemExit: | ||||
|             err = f"Could not clone '{name}' in the repo '{repo}'." | ||||
|             msg.fail(err) | ||||
|         shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) | ||||
|     msg.good(f"Cloned project '{name}' from {repo} into {project_dir}") | ||||
|     for sub_dir in DIRS: | ||||
|         dir_path = project_dir / sub_dir | ||||
|         if not dir_path.exists(): | ||||
|             dir_path.mkdir(parents=True) | ||||
|     if not no_init: | ||||
|         project_init(project_dir, git=git, force=True, silent=True) | ||||
|     msg.good(f"Your project is now ready!", dest) | ||||
|     print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") | ||||
| 
 | ||||
| 
 | ||||
| def project_init( | ||||
|     project_dir: Path, | ||||
|     *, | ||||
|     git: bool = False, | ||||
|     force: bool = False, | ||||
|     silent: bool = False, | ||||
|     analytics: bool = False, | ||||
| ): | ||||
|     """Initialize a project as a DVC and (optionally) as a Git repo. | ||||
| 
 | ||||
|     project_dir (Path): Path to project directory. | ||||
|     git (bool): Also call "git init" to initialize directory as a Git repo. | ||||
|     force (bool): Pass "--force" to "dvc init". | ||||
|     silent (bool): Don't print any output (via DVC). | ||||
|     analytics (bool): Opt-in to DVC analytics (defaults to False). | ||||
|     """ | ||||
|     with working_dir(project_dir) as cwd: | ||||
|         if git: | ||||
|             run_command(["git", "init"]) | ||||
|         init_cmd = ["dvc", "init"] | ||||
|         if silent: | ||||
|             init_cmd.append("--quiet") | ||||
|         if not git: | ||||
|             init_cmd.append("--no-scm") | ||||
|         if force: | ||||
|             init_cmd.append("--force") | ||||
|         run_command(init_cmd) | ||||
|         # We don't want to have analytics on by default – our users should | ||||
|         # opt-in explicitly. If they want it, they can always enable it. | ||||
|         if not analytics: | ||||
|             run_command(["dvc", "config", "core.analytics", "false"]) | ||||
|         # Remove unused and confusing plot templates from .dvc directory | ||||
|         # TODO: maybe we shouldn't do this, but it's otherwise super confusing | ||||
|         # once you commit your changes via Git and it creates a bunch of files | ||||
|         # that have no purpose | ||||
|         plots_dir = cwd / DVC_DIR / "plots" | ||||
|         if plots_dir.exists(): | ||||
|             shutil.rmtree(str(plots_dir)) | ||||
|         config = load_project_config(cwd) | ||||
|         setup_check_dvc(cwd, config) | ||||
| 
 | ||||
| 
 | ||||
def project_assets(project_dir: Path) -> None:
    """Fetch all assets listed in the project config and add them to DVC.

    project_dir (Path): Path to project directory.
    """
    root = ensure_path(project_dir)
    config = load_project_config(root)
    setup_check_dvc(root, config)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    variables = config.get("variables", {})
    tracked = []
    for entry in assets:
        # Both the URL and the destination may contain {placeholders} that
        # are filled in from the config's "variables" section.
        source = entry["url"].format(**variables)
        target = entry["dest"].format(**variables)
        result = fetch_asset(root, source, target, entry.get("checksum"))
        if not result:
            continue
        tracked.append(str(result))
    if not tracked:
        return
    with working_dir(root):
        run_command(["dvc", "add", *tracked, "--external"])
| 
 | ||||
| 
 | ||||
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
    """Fetch an asset from a given URL or path. Will try to import the file
    using DVC's import-url if possible (fully tracked and versioned) and falls
    back to get-url (versioned) and a non-DVC download if necessary. If a
    checksum is provided and a local file exists, it's only re-downloaded if the
    checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, relative to the project directory.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
        the asset failed.
    """
    url = convert_asset_url(url)
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        # TODO: add support for caches (dvc import-url with local path)
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return dest_path
    with working_dir(project_path):
        try:
            # If these fail, we don't want to output an error or info message.
            # Try with tracking the source first, then just downloading with
            # DVC, then a regular non-DVC download.
            try:
                dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
            except subprocess.CalledProcessError:
                dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
        except subprocess.CalledProcessError:
            try:
                download_file(url, dest_path)
            except requests.exceptions.RequestException as e:
                # RequestException is the base class of HTTPError and also
                # covers connection errors, timeouts and malformed URLs, so
                # every failed download is reported and yields None instead
                # of an unhandled traceback.
                msg.fail(f"Download failed: {dest}", e)
                return None
    if checksum and checksum != get_checksum(dest_path):
        msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
    msg.good(f"Fetched asset {dest}")
    return dest_path
| 
 | ||||
| 
 | ||||
def project_run_all(project_dir: Path, *dvc_args) -> None:
    """Reproduce the whole project pipeline by calling "dvc repro".

    project_dir (Path): Path to project directory.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    # Loading the config validates it; setup_check_dvc then checks the DVC
    # setup before anything is executed.
    setup_check_dvc(project_dir, load_project_config(project_dir))
    with working_dir(project_dir):
        run_command(["dvc", "repro", *dvc_args])
| 
 | ||||
| 
 | ||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Print CLI-style help text built from the project config.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. With a subcommand,
        its usage line and help text (if any) are shown. Without one, a table
        of all available commands and the top-level usage is printed.
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    listed = config.get("commands", [])
    by_name = {entry["name"]: entry for entry in listed}
    if not subcommand:
        # Top-level help: overview of everything defined in the config.
        print(f"\nAvailable commands in {CONFIG_FILE}")
        print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
        rows = [(entry["name"], entry.get("help", "")) for entry in listed]
        msg.table(rows)
        msg.text("Run all commands defined in the 'run' block of the project config:")
        print(f"{COMMAND} project run-all {project_dir}")
        return
    # Subcommand help: make sure the command exists before describing it.
    validate_subcommand(by_name.keys(), subcommand)
    print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
    description = by_name[subcommand].get("help")
    if description:
        msg.text(f"\n{description}\n")
| 
 | ||||
| 
 | ||||
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
    """Run a single named command from the project config. Commands listed in
    the config's "run" section are executed through "dvc repro"; all other
    commands have their script run directly after their dependencies have
    been checked.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    by_name = {entry["name"]: entry for entry in config.get("commands", [])}
    variables = config.get("variables", {})
    validate_subcommand(by_name.keys(), subcommand)
    if subcommand in config.get("run", []):
        # Pipeline command tracked in DVC: let DVC handle the execution.
        with working_dir(project_dir):
            run_command(["dvc", "repro", subcommand, *dvc_args])
        return
    entry = by_name[subcommand]
    # Deps of non-DVC commands aren't tracked, but if any are defined they
    # have to exist before the script is started.
    for dep in entry.get("deps", []):
        if (project_dir / dep).exists():
            continue
        err = f"Missing dependency specified by command '{subcommand}': {dep}"
        msg.fail(err, exits=1)
    with working_dir(project_dir):
        run_commands(entry["script"], variables)
| 
 | ||||
| 
 | ||||
def project_exec(project_dir: Path, subcommand: str):
    """Execute a command defined in the project config.

    The command's "script" entries are run from inside the project directory,
    with the config's "variables" substituted into them. An unknown command
    name is reported via validate_subcommand instead of surfacing as a bare
    KeyError.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    # Same check the other entry points perform before looking up a command.
    validate_subcommand(commands.keys(), subcommand)
    with working_dir(project_dir):
        run_commands(commands[subcommand]["script"], variables)
| 
 | ||||
| 
 | ||||
| ########### | ||||
| # HELPERS # | ||||
| ########### | ||||
| 
 | ||||
| 
 | ||||
def load_project_config(path: Path) -> Dict[str, Any]:
    """Read the project config file of a project directory and validate it
    against ProjectConfigSchema. Any problem is reported via msg.fail with
    exits=1.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project config.
    """
    invalid_err = f"Invalid project config in {CONFIG_FILE}"
    config_file = path / CONFIG_FILE
    if not config_file.exists():
        msg.fail("Can't find project config", config_file, exits=1)
    try:
        loaded = srsly.read_yaml(config_file)
    except ValueError as read_error:
        msg.fail(invalid_err, read_error, exits=1)
    problems = validate(ProjectConfigSchema, loaded)
    if problems:
        msg.fail(invalid_err, "\n".join(problems), exits=1)
    return loaded
| 
 | ||||
| 
 | ||||
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Regenerate the DVC config file in the project directory by re-running
    "dvc run --no-exec" for the pipeline commands. The first line of the
    generated file holds a hash of the project config, so the file is only
    rebuilt when the config has changed (or when forced).

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # The first line has the form "# <hash>": compare it to the hash of
        # the current config to find out whether the file is still valid.
        with dvc_config_path.open("r", encoding="utf8") as f:
            first_line = f.readline()
        stored_hash = first_line.strip().replace("# ", "")
        if not force and stored_hash == config_hash:
            return False  # Nothing has changed in project config, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    # Only commands listed under "run" in the project config are part of the
    # pipeline and get registered, in sequence.
    by_name = {entry["name"]: entry for entry in config.get("commands", [])}
    dvc_commands = []
    for name in config.get("run", []):
        validate_subcommand(by_name.keys(), name)
        entry = by_name[name]
        deps = entry.get("deps", [])
        outputs = entry.get("outputs", [])
        outputs_no_cache = entry.get("outputs_no_cache", [])
        if not (deps or outputs or outputs_no_cache):
            continue
        # The project directory is used as the working dir since the DVC
        # config is auto-generated and shouldn't contain arbitrary paths.
        parts = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
        if verbose:
            parts.append("--verbose")
        if silent:
            parts.append("--quiet")
        for dep in deps:
            parts.extend(["-d", dep])
        for output in outputs:
            parts.extend(["-o", output])
        for output in outputs_no_cache:
            parts.extend(["-O", output])
        parts.extend(["python", "-m", NAME, "project", "exec", name])
        dvc_commands.append(" ".join(parts))
    with working_dir(path):
        run_commands(dvc_commands, variables, silent=True)
    # Prepend the hash and the explanatory comment to the generated file.
    with dvc_config_path.open("r+", encoding="utf8") as f:
        generated = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{generated}")
    return True
| 
 | ||||
| 
 | ||||
def ensure_dvc() -> None:
    """Check that "dvc --version" can be launched; report an error with
    install instructions and exit otherwise."""
    title = "spaCy projects require DVC (Data Version Control) and the 'dvc' command"
    instructions = (
        "You can install the Python package from pip (pip install dvc) or "
        "conda (conda install -c conda-forge dvc). For more details, see the "
        "documentation: https://dvc.org/doc/install"
    )
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(title, instructions, exits=1)
| 
 | ||||
| 
 | ||||
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
    """Check that the project is set up correctly with DVC and update its
    config if needed. Reports an error and exits if the project directory
    doesn't exist or is not an initialized DVC project.

    project_dir (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    """
    if not project_dir.exists():
        # Exit right away: without exits=1 execution would fall through to
        # the check below and additionally suggest initializing a directory
        # that doesn't exist.
        msg.fail(f"Can't find project directory: {project_dir}", exits=1)
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project.",
            f"Make sure that the project template was cloned correctly. To "
            f"initialize the project directory manually, you can run: "
            f"{COMMAND} project init {project_dir}",
            exits=1,
        )
    with msg.loading("Updating DVC config..."):
        updated = update_dvc_config(project_dir, config, silent=True)
    if updated:
        msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
| 
 | ||||
| 
 | ||||
def run_commands(
    commands: List[str] = tuple(),
    variables: Optional[Dict[str, str]] = None,
    silent: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    variables (Optional[Dict[str, str]]): Dictionary of variable names, mapped
        to their values. Will be used to substitute format string variables in
        the commands. None (the default) means no variables.
    silent (bool): Don't print the commands.
    """
    # Avoid a shared mutable default argument: create the dict per call.
    if variables is None:
        variables = {}
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if command and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif command and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {' '.join(command)}")
        run_command(command)
| 
 | ||||
| 
 | ||||
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed. Regular GitHub page URLs
    are rewritten to their raw.githubusercontent.com equivalent and a warning
    is shown; every other URL is returned unchanged.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake.
    # Raw string with an escaped dot, so only the literal host "github.com"
    # matches (and no invalid "\/" escape is needed).
    if re.match(r"https?://github\.com", url):
        # Only rewrite the host, not a later "github.com" inside the path.
        converted = url.replace("github.com", "raw.githubusercontent.com", 1)
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
| 
 | ||||
| 
 | ||||
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.
    Each failed check is reported via msg.fail with exits=1.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        # msg.fail only takes a title and one text argument; a third
        # positional string would be consumed as its "show" flag and the
        # suggested command would never be printed. Keep it in the text.
        msg.fail(
            "Cloning spaCy project templates requires Git and the 'git' command.",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run: "
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )
| 
 | ||||
| 
 | ||||
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
    """Make sure a subcommand is one of the available commands. If it isn't,
    an error listing the available commands is reported with exits=1.

    commands (Sequence[str]): The available commands.
    subcommand (str): The subcommand.
    """
    if subcommand in commands:
        return
    available = ", ".join(commands)
    msg.fail(
        f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
        f"Available commands: {available}",
        exits=1,
    )
| 
 | ||||
| 
 | ||||
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests, streaming it to disk in chunks while
    showing a progress bar.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    # Use the response as a context manager so the streamed connection is
    # released even if writing the file fails halfway through.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        total = int(response.headers.get("content-length", 0))
        progress_settings = {
            "total": total,
            "unit": "iB",
            "unit_scale": True,
            # Binary unit prefixes are always based on 1024, independent of
            # the chunk size used for reading.
            "unit_divisor": 1024,
            "leave": False,
        }
        with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
            for data in response.iter_content(chunk_size=chunk_size):
                size = f.write(data)
                bar.update(size)
							
								
								
									
										0
									
								
								spacy/cli/project/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/cli/project/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										158
									
								
								spacy/cli/project/assets.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										158
									
								
								spacy/cli/project/assets.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,158 @@ | |||
| from typing import Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import requests | ||||
| import tqdm | ||||
| import re | ||||
| import shutil | ||||
| 
 | ||||
| from ...util import ensure_path, working_dir | ||||
| from .._app import project_cli, Arg | ||||
| from .util import PROJECT_FILE, load_project_config, get_checksum | ||||
| 
 | ||||
| 
 | ||||
| # TODO: find a solution for caches | ||||
| # CACHES = [ | ||||
| #     Path.home() / ".torch", | ||||
| #     Path.home() / ".caches" / "torch", | ||||
| #     os.environ.get("TORCH_HOME"), | ||||
| #     Path.home() / ".keras", | ||||
| # ] | ||||
| 
 | ||||
| 
 | ||||
@project_cli.command("assets")
def project_assets_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.
    """
    # NOTE(review): the docstring above is presumably shown as the command's
    # CLI help text, so treat it as user-facing – confirm against project_cli.
    # Thin entry point: the actual work is done by project_assets.
    project_assets(project_dir)
| 
 | ||||
| 
 | ||||
def project_assets(project_dir: Path) -> None:
    """Fetch the assets defined in the project config. Assets with a URL are
    downloaded (or copied); assets without a URL are expected to be provided
    by the user and are only checked.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    variables = config.get("variables", {})
    for asset in assets:
        dest = asset["dest"].format(**variables)
        url = asset.get("url")
        checksum = asset.get("checksum")
        if not url:
            # project.yml defines asset without URL that the user has to place.
            # Asset destinations are relative to the project directory (that's
            # how fetch_asset resolves them), so run the check from there
            # instead of from wherever the command was invoked.
            with working_dir(project_path):
                check_private_asset(dest, checksum)
            continue
        url = url.format(**variables)
        fetch_asset(project_path, url, dest, checksum)
| 
 | ||||
| 
 | ||||
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Check and validate assets without a URL (private assets that the user
    has to provide themselves) and give feedback about the checksum.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
        msg.warn(err)
    elif not checksum:
        # Without an expected checksum there is nothing to compare against,
        # so an existing file must not be reported as having a wrong one.
        msg.good(f"Asset already exists: {dest}")
    elif checksum == get_checksum(dest):
        msg.good(f"Asset exists with matching checksum: {dest}")
    else:
        msg.fail(f"Asset available but with incorrect checksum: {dest}")
| 
 | ||||
| 
 | ||||
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
    """Fetch an asset from a given URL or path. If a checksum is provided and a
    local file exists, it's only re-downloaded if the checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, relative to the project directory.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
        the asset failed.
    """
    # TODO: add support for caches
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return dest_path
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            if Path(url).exists() and Path(url).is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
                return None
    if checksum and checksum != get_checksum(dest_path):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
    # Return the path on every successful fetch, not only when the download
    # was skipped, so the result matches the documented return value.
    return dest_path
| 
 | ||||
| 
 | ||||
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed. Regular GitHub page URLs
    are rewritten to their raw.githubusercontent.com equivalent and a warning
    is shown; every other URL is returned unchanged.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake.
    # The dot is escaped so that only the literal host "github.com" matches.
    if re.match(r"https?://github\.com", url):
        # Only rewrite the host, not a later "github.com" inside the path.
        converted = url.replace("github.com", "raw.githubusercontent.com", 1)
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
| 
 | ||||
| 
 | ||||
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests, streaming it to disk in chunks while
    showing a progress bar.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    # Use the response as a context manager so the streamed connection is
    # released even if writing the file fails halfway through.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        total = int(response.headers.get("content-length", 0))
        progress_settings = {
            "total": total,
            "unit": "iB",
            "unit_scale": True,
            # Binary unit prefixes are always based on 1024, independent of
            # the chunk size used for reading.
            "unit_divisor": 1024,
            "leave": False,
        }
        with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
            for data in response.iter_content(chunk_size=chunk_size):
                size = f.write(data)
                bar.update(size)
							
								
								
									
										97
									
								
								spacy/cli/project/clone.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								spacy/cli/project/clone.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,97 @@ | |||
| from typing import Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import subprocess | ||||
| import shutil | ||||
| import re | ||||
| 
 | ||||
| from ... import about | ||||
| from ...util import ensure_path, run_command, make_tempdir | ||||
| from .._app import project_cli, Arg, Opt, COMMAND | ||||
| from .util import PROJECT_FILE | ||||
| 
 | ||||
| 
 | ||||
@project_cli.command("clone")
def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to a directory named after the template in the current working directory", exists=False),
    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
    (including using a private repo).
    """
    if dest is None:
        # The clone target has to be a new directory, so the default is a
        # directory named after the template inside the current working dir.
        dest = Path.cwd() / name
    project_clone(name, dest, repo=repo)
| 
 | ||||
| 
 | ||||
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
    """Clone a project template from a repository. Only the requested
    subdirectory is checked out (via Git sparse checkout in a temporary
    directory) and then moved to the destination.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # Shortened repo name, only used for messages
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
        try:
            run_command(cmd)
        except subprocess.CalledProcessError:
            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
            # Stop here: the steps below need the cloned repo and would
            # otherwise fail with an unrelated traceback.
            msg.fail(err, exits=1)
        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
            f.write(name)
        try:
            run_command(["git", "-C", str(tmp_dir), "fetch"])
            run_command(["git", "-C", str(tmp_dir), "checkout"])
        except subprocess.CalledProcessError:
            err = f"Could not clone '{name}' from repo '{repo_name}'"
            # Without the checkout there is nothing to move, so exit as well.
            msg.fail(err, exits=1)
        # We need Path(name) to make sure we also support subdirectories
        shutil.move(str(tmp_dir / Path(name)), str(project_dir))
    msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
    if not (project_dir / PROJECT_FILE).exists():
        msg.warn(f"No {PROJECT_FILE} found in directory")
    else:
        msg.good("Your project is now ready!")
        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
| 
 | ||||
| 
 | ||||
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.
    Prints an error and exits with status 1 if any check fails.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        # The printer takes a title and ONE text argument; a third positional
        # value would be interpreted as its "show" flag and never be printed,
        # so the follow-up command has to be part of the text itself.
        msg.fail(
            "Cloning spaCy project templates requires Git and the 'git' command. ",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run: "
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )
							
								
								
									
										208
									
								
								spacy/cli/project/dvc.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										208
									
								
								spacy/cli/project/dvc.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,208 @@ | |||
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
| from typing import Dict, Any, List, Optional | ||||
| import subprocess | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .util import PROJECT_FILE, load_project_config, get_hash | ||||
| from .._app import project_cli, Arg, Opt, NAME, COMMAND | ||||
| from ...util import working_dir, split_command, join_command, run_command | ||||
| 
 | ||||
| 
 | ||||
# Name of the DVC config file that is (re)generated in the project directory
DVC_CONFIG = "dvc.yaml"
# NOTE(review): presumably the directory that "dvc init" creates – confirm
DVC_DIR = ".dvc"
# Name of the "project" subcommand that regenerates the DVC config
UPDATE_COMMAND = "dvc"
# Header comment written to the generated DVC config, after the hash line
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
| 
 | ||||
| 
 | ||||
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml changed.
    """
    # Thin CLI wrapper: forward everything to the Python API
    project_update_dvc(
        project_dir=project_dir,
        workflow=workflow,
        verbose=verbose,
        force=force,
    )
| 
 | ||||
| 
 | ||||
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    force: bool = False,
) -> None:
    """Regenerate the DVC config file for a project and report the result.
    Since a DVC project can only define one pipeline, exactly one workflow of
    the project.yml is converted. The file is only rewritten if the config
    hash changed, unless an update is forced.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    force (bool): Force update DVC config.
    """
    project_config = load_project_config(project_dir)
    was_updated = update_dvc_config(
        project_dir, project_config, workflow, verbose=verbose, force=force
    )
    hint = "To execute the workflow with DVC, run: dvc repro"
    if not was_updated:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", hint)
        return
    msg.good(f"Updated DVC config from {PROJECT_FILE}", hint)
| 
 | ||||
| 
 | ||||
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Regenerate the dvc.yaml file in the project directory by running the
    DVC commands for one workflow without executing them. The generated file
    starts with a comment line holding the hash of the config dict, which is
    compared on the next call to decide whether the file is up to date.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    workflow (Optional[str]): Name of the workflow to convert. If not set, the
        first workflow defined in the config is used.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """

    def with_flag(flag: str, values: List[str]) -> List[str]:
        # Turn ["a", "b"] into [flag, "a", flag, "b"]
        pairs = []
        for value in values:
            pairs.extend((flag, value))
        return pairs

    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    if not workflow:
        workflow = workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # The first line of an existing file holds the hash of the config it
        # was generated from
        with dvc_config_path.open("r", encoding="utf8") as f:
            first_line = f.readline()
        ref_hash = first_line.strip().replace("# ", "")
        if not force and ref_hash == config_hash:
            return False  # Nothing has changed in project.yml, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    dvc_commands = []
    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not (deps or outputs or outputs_no_cache):
            # Commands without any dependencies or outputs are left out
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        full_cmd = (
            dvc_cmd
            + with_flag("-d", deps)
            + with_flag("-o", outputs)
            + with_flag("-O", outputs_no_cache)
            + ["python", "-m", NAME, "project", "run", name]
        )
        dvc_commands.append(join_command(full_cmd))
    with working_dir(path):
        dvc_flags = {"--verbose": verbose, "--quiet": silent}
        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
    # Prepend the hash line and the "auto-generated" notice to the new file
    with dvc_config_path.open("r+", encoding="utf8") as f:
        generated = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{generated}")
    return True
| 
 | ||||
| 
 | ||||
def run_dvc_commands(
    commands: Optional[List[str]] = None,
    variables: Optional[Dict[str, str]] = None,
    flags: Optional[Dict[str, bool]] = None,
) -> None:
    """Run a sequence of DVC commands in a subprocess, in order.

    commands (Optional[List[str]]): The string commands without the leading
        "dvc". If None, nothing is run.
    variables (Optional[Dict[str, str]]): Dictionary of variable names, mapped
        to their values. Will be used to substitute format string variables in
        the commands. If None, no variables are available.
    flags (Optional[Dict[str, bool]]): Conditional flags to be added to command.
        Makes it easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
    """
    # None instead of shared mutable default arguments
    commands = [] if commands is None else commands
    variables = {} if variables is None else variables
    flags = {} if flags is None else flags
    # The flags are the same for every command, so only filter them once
    active_flags = [flag for flag, is_active in flags.items() if is_active]
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        run_command(["dvc", *command, *active_flags])
| 
 | ||||
| 
 | ||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Check that the project defines workflows and that the requested
    workflow, if any, is one of them. Failing checks are reported as errors
    with exit status 1. A missing workflow name only produces a warning
    naming the first workflow.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        no_workflows_err = (
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            "define at least one list of commands."
        )
        msg.fail(no_workflows_err, exits=1)
    is_unknown = workflow is not None and workflow not in workflows
    if is_unknown:
        available = ", ".join(workflows)
        unknown_err = (
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {available}"
        )
        msg.fail(unknown_err, exits=1)
    if not workflow:
        first = workflows[0]
        msg.warn(
            "No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{first}'"
        )
| 
 | ||||
| 
 | ||||
def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project. Prints an error and exits with
    status 1 otherwise.

    project_dir (Path): The project directory, expected to contain the DVC
        directory.
    """
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
    # Use the shared module constant instead of repeating the directory name
    if not (project_dir / DVC_DIR).exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
							
								
								
									
										266
									
								
								spacy/cli/project/run.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										266
									
								
								spacy/cli/project/run.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,266 @@ | |||
| from typing import Optional, List, Dict, Sequence, Any | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import sys | ||||
| import srsly | ||||
| 
 | ||||
| from ...util import working_dir, run_command, split_command, is_cwd, join_command | ||||
| from .._app import project_cli, Arg, Opt, COMMAND | ||||
| from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash | ||||
| from .util import get_checksum | ||||
| 
 | ||||
| 
 | ||||
@project_cli.command("run")
def project_run_cli(
    # fmt: off
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.
    """
    # Only run something if a name was given and help wasn't requested,
    # otherwise show the (subcommand-specific or general) help
    if subcommand and not show_help:
        project_run(project_dir, subcommand, force=force, dry=dry)
    else:
        print_run_help(project_dir, subcommand)
| 
 | ||||
| 
 | ||||
def project_run(
    project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
) -> None:
    """Run a named command or workflow defined in the project.yml. If the name
    refers to a workflow, all of its commands are run, in order. For a single
    command, the dependencies are checked first and the script is then
    executed from within the project directory, unless the lockfile shows
    that nothing changed and re-running isn't forced.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command or workflow to run.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    """
    config = load_project_config(project_dir)
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(project_dir, cmd, force=force, dry=dry)
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            dep = dep.format(**variables)
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                # In a dry run, report the missing dependency but keep going
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, **err_kwargs)
        with working_dir(project_dir) as current_dir:
            rerun = check_rerun(current_dir, cmd, variables)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                msg.divider(subcommand)
                run_commands(cmd["script"], variables, dry=dry)
                # A dry run doesn't produce anything, so don't record it
                if not dry:
                    update_lockfile(current_dir, cmd, variables)
| 
 | ||||
| 
 | ||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Print a CLI-style help text built from the contents of the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    # Only mention the project directory if it's not the current directory
    project_loc = "" if is_cwd(project_dir) else project_dir
    if not subcommand:
        # Top-level help: list everything that can be run
        print("")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            command_rows = [(cmd["name"], cmd.get("help", "")) for cmd in config_commands]
            msg.table(command_rows)
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            workflow_rows = [(name, " -> ".join(steps)) for name, steps in workflows.items()]
            msg.table(workflow_rows)
        return
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
    if subcommand in commands:
        help_text = commands[subcommand].get("help")
        if help_text:
            print(f"\n{help_text}\n")
    elif subcommand in workflows:
        steps = workflows[subcommand]
        print(f"\nWorkflow consisting of {len(steps)} commands:")
        steps_data = []
        for number, step in enumerate(steps, start=1):
            steps_data.append((f"{number}. {step}", commands[step].get("help", "")))
        msg.table(steps_data)
        help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
        print(f"For command details, run: {help_cmd}")
| 
 | ||||
| 
 | ||||
def run_commands(
    commands: Optional[List[str]] = None,
    variables: Optional[Dict[str, Any]] = None,
    silent: bool = False,
    dry: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (Optional[List[str]]): The string commands. If None, nothing is
        run.
    variables (Optional[Dict[str, Any]]): Dictionary of variable names, mapped
        to their values. Will be used to substitute format string variables in
        the commands. If None, no variables are available.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    """
    # None instead of shared mutable default arguments
    commands = [] if commands is None else commands
    variables = {} if variables is None else variables
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if command and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif command and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if not dry:
            run_command(command)
| 
 | ||||
| 
 | ||||
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Make sure a name refers to a defined command or workflow. Otherwise,
    an error listing the available names is printed with exit status 1.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not (commands or workflows):
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand in commands or subcommand in workflows:
        return
    # Unknown name: only list the kinds of names that are actually defined
    hints = []
    if commands:
        hints.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        hints.append(f"Available workflows: {', '.join(workflows)}")
    title = f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}"
    msg.fail(title, ". ".join(hints), exits=1)
| 
 | ||||
| 
 | ||||
def check_rerun(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> bool:
    """Decide whether a command needs to be run again by comparing its entry
    in the lockfile to the entry that would be generated right now.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (bool): Whether to re-run the command.
    """
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():
        # No lockfile yet, so nothing is known about previous runs
        return True
    lock_data = srsly.read_yaml(lock_path)
    name = command["name"]
    if name not in lock_data:
        # The lockfile has no record of this command
        return True
    locked_entry = lock_data[name]
    if not locked_entry.get("outs", []):
        # Commands without outputs always run (they'd always be skipped otherwise)
        return True
    # Identical hashes mean that inputs/outputs, checksums and the script are
    # all unchanged, so the command doesn't have to be run again
    current_hash = get_hash(get_lock_entry(project_dir, command, variables))
    locked_hash = get_hash(locked_entry)
    return current_hash != locked_hash
| 
 | ||||
| 
 | ||||
def update_lockfile(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add or replace the entry for the current
    command, its script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    # Start from the existing entries, if any. The file is written exactly
    # once below, so there's no need to create an empty one first. An empty
    # lockfile is treated like a missing one.
    data = (srsly.read_yaml(lock_path) or {}) if lock_path.exists() else {}
    data[command["name"]] = get_lock_entry(project_dir, command, variables)
    srsly.write_yaml(lock_path, data)
| 
 | ||||
| 
 | ||||
def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
    """Build the lockfile entry for a command: the command, its script
    (command steps) and the dependencies and outputs with their paths and
    file hashes, if available. The format is based on the dvc.lock files, to
    keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """

    def fileinfo_for(key: str) -> List[Dict[str, str]]:
        # File information for the paths listed under one key of the command
        return get_fileinfo(project_dir, command.get(key, []), variables)

    dependencies = fileinfo_for("deps")
    cached_outputs = fileinfo_for("outputs")
    uncached_outputs = fileinfo_for("outputs_no_cache")
    entry = {}
    entry["cmd"] = f"{COMMAND} run {command['name']}"
    entry["script"] = command["script"]
    entry["deps"] = dependencies
    # Both kinds of outputs end up in the same list
    entry["outs"] = cached_outputs + uncached_outputs
    return entry
| 
 | ||||
| 
 | ||||
def get_fileinfo(
    project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
    """Collect the file information for a list of paths (dependencies,
    outputs): the path with variables substituted and the checksum of the
    file or directory, or None if it doesn't exist.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
    """

    def describe(raw_path: str) -> Dict[str, str]:
        # Substitute variables, then hash whatever exists at that location
        resolved = raw_path.format(**variables)
        location = project_dir / resolved
        checksum = None
        if location.exists():
            checksum = get_checksum(location)
        return {"path": resolved, "md5": checksum}

    return [describe(raw_path) for raw_path in paths]
							
								
								
									
										93
									
								
								spacy/cli/project/util.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										93
									
								
								spacy/cli/project/util.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,93 @@ | |||
| from typing import Dict, Any, Union | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import srsly | ||||
| import hashlib | ||||
| 
 | ||||
| from ...schemas import ProjectConfigSchema, validate | ||||
| 
 | ||||
| 
 | ||||
# Name of the project config file, expected in the project directory
PROJECT_FILE = "project.yml"
# NOTE(review): presumably the name of the project's lockfile – confirm
PROJECT_LOCK = "project.lock"
| 
 | ||||
| 
 | ||||
def load_project_config(path: Path) -> Dict[str, Any]:
    """Read and validate the project.yml file of a project directory and
    create all directories defined in the config that don't exist yet.
    Prints an error and exits with status 1 if the file is missing or invalid.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    schema_errors = validate(ProjectConfigSchema, config)
    if schema_errors:
        msg.fail(invalid_err, "\n".join(schema_errors), exits=1)
    validate_project_commands(config)
    # Make sure directories defined in config exist
    project_dirs = [path / subdir for subdir in config.get("directories", [])]
    for project_subdir in project_dirs:
        if project_subdir.exists():
            continue
        project_subdir.mkdir(parents=True)
    return config
| 
 | ||||
| 
 | ||||
def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist. Prints an
    error and exits with status 1 otherwise.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    # Find the duplicates in a single pass. The set of seen names is also used
    # for the membership checks below.
    known_commands = set()
    duplicates = set()
    for command_name in command_names:
        if command_name in known_commands:
            duplicates.add(command_name)
        known_commands.add(command_name)
    if duplicates:
        # Sort the names so the message doesn't depend on the set order
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(sorted(duplicates))}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        if workflow_name in known_commands:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        for step in workflow_steps:
            if step not in known_commands:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
| 
 | ||||
| 
 | ||||
def get_hash(data) -> str:
    """Get the MD5 hex digest for a JSON-serializable object.

    The object is dumped to JSON with sorted keys and the UTF-8 encoded
    result is hashed.

    data: The data to hash.
    RETURNS (str): The hash.
    """
    serialized = srsly.json_dumps(data, sort_keys=True)
    digest = hashlib.md5()
    digest.update(serialized.encode("utf8"))
    return digest.hexdigest()
| 
 | ||||
| 
 | ||||
def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file or directory given its file path. If a
    directory path is provided, this uses all files in that directory.

    For a directory, the contents of all files found recursively are hashed
    in sorted path order. Only file contents are fed to the hash, not file
    names. Files are read in chunks so large assets are never loaded into
    memory in one piece.

    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The MD5 checksum as a hex digest.
    RAISES (ValueError): If the path is neither a file nor a directory.
    """
    path = Path(path)
    if path.is_file():
        file_paths = [path]
    elif path.is_dir():
        # TODO: this is currently pretty slow
        file_paths = sorted(fp for fp in path.rglob("*") if fp.is_file())
    else:
        raise ValueError(f"Can't get checksum for {path}: not a file or directory")
    chunk_size = 1024 * 1024
    checksum = hashlib.md5()
    for file_path in file_paths:
        with file_path.open("rb") as file_:
            # Feeding the hash chunk by chunk gives the same digest as hashing
            # the whole file at once.
            for chunk in iter(lambda: file_.read(chunk_size), b""):
                checksum.update(chunk)
    return checksum.hexdigest()
|  | @ -121,14 +121,14 @@ class ConfigSchema(BaseModel): | |||
| @app.command("train") | ||||
| def train_cli( | ||||
|     # fmt: off | ||||
|     train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True), | ||||
|     dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True), | ||||
|     train_path: Path = Arg(..., help="Location of training data", exists=True), | ||||
|     dev_path: Path = Arg(..., help="Location of development data", exists=True), | ||||
|     config_path: Path = Arg(..., help="Path to config file", exists=True), | ||||
|     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."), | ||||
|     raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."), | ||||
|     verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"), | ||||
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), | ||||
|     use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), | ||||
|     num_workers: int = Opt(None, "-j", help="Parallel Workers"), | ||||
|     strategy: str = Opt("allreduce", "--strategy", help="Distributed training strategy (requires spacy_ray)"), | ||||
|  | @ -155,6 +155,7 @@ def train_cli( | |||
|     if init_tok2vec is not None: | ||||
|         with init_tok2vec.open("rb") as file_: | ||||
|             weights_data = file_.read() | ||||
| 
 | ||||
|     train_args = dict( | ||||
|         config_path=config_path, | ||||
|         data_paths={"train": train_path, "dev": dev_path}, | ||||
|  | @ -170,7 +171,7 @@ def train_cli( | |||
|         distributed_setup_and_train(use_gpu, num_workers, strategy, ray_address, train_args) | ||||
|     else: | ||||
|         if use_gpu >= 0: | ||||
|             msg.info(f"Using GPU: {str(use_gpu)}") | ||||
|             msg.info(f"Using GPU: {use_gpu}") | ||||
|             require_gpu(use_gpu) | ||||
|         else: | ||||
|             msg.info("Using CPU") | ||||
|  | @ -191,7 +192,8 @@ def train( | |||
|     msg.info(f"Loading config from: {config_path}") | ||||
|     # Read the config first without creating objects, to get to the original nlp_config | ||||
|     config = util.load_config(config_path, create_objects=False) | ||||
|     fix_random_seed(config["training"]["seed"]) | ||||
|     if config["training"].get("seed"): | ||||
|         fix_random_seed(config["training"]["seed"]) | ||||
|     if config["training"].get("use_pytorch_for_gpu_memory"): | ||||
|         # It feels kind of weird to not have a default for this. | ||||
|         use_pytorch_for_gpu_memory() | ||||
|  | @ -216,7 +218,10 @@ def train( | |||
|         msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") | ||||
|         train_examples = list( | ||||
|             corpus.train_dataset( | ||||
|                 nlp, shuffle=False, gold_preproc=training["gold_preproc"] | ||||
|                 nlp, | ||||
|                 shuffle=False, | ||||
|                 gold_preproc=training["gold_preproc"], | ||||
|                 max_length=training["max_length"], | ||||
|             ) | ||||
|         ) | ||||
|         nlp.begin_training(lambda: train_examples) | ||||
|  | @ -315,6 +320,7 @@ def create_train_batches(nlp, corpus, cfg, randomization_index): | |||
|     ) | ||||
| 
 | ||||
|     epoch = 0 | ||||
|     batch_strategy = cfg.get("batch_by", "sequences") | ||||
|     while True: | ||||
|         if len(train_examples) == 0: | ||||
|             raise ValueError(Errors.E988) | ||||
|  | @ -324,11 +330,22 @@ def create_train_batches(nlp, corpus, cfg, randomization_index): | |||
|             random.random() | ||||
|         random.shuffle(train_examples) | ||||
|         epoch += 1 | ||||
|         batches = util.minibatch_by_words( | ||||
|             train_examples, | ||||
|             size=cfg["batch_size"], | ||||
|             discard_oversize=cfg["discard_oversize"], | ||||
|         ) | ||||
|         if batch_strategy == "padded": | ||||
|             batches = util.minibatch_by_padded_size( | ||||
|                 train_examples, | ||||
|                 size=cfg["batch_size"], | ||||
|                 buffer=256, | ||||
|                 discard_oversize=cfg["discard_oversize"], | ||||
|             ) | ||||
|         elif batch_strategy == "words": | ||||
|             batches = util.minibatch_by_words( | ||||
|                 train_examples, | ||||
|                 size=cfg["batch_size"], | ||||
|                 discard_oversize=cfg["discard_oversize"], | ||||
|             ) | ||||
|         else: | ||||
|             batches = util.minibatch(train_examples, size=cfg["batch_size"]) | ||||
| 
 | ||||
|         # make sure the batches iterator is not empty, or we'll have an infinite training loop | ||||
|         try: | ||||
|             first = next(batches) | ||||
|  | @ -440,7 +457,9 @@ def train_while_improving( | |||
| 
 | ||||
|     if raw_text: | ||||
|         random.shuffle(raw_text) | ||||
|         raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text] | ||||
|         raw_examples = [ | ||||
|             Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text | ||||
|         ] | ||||
|         raw_batches = util.minibatch(raw_examples, size=8) | ||||
| 
 | ||||
|     for step, (epoch, batch) in enumerate(train_data): | ||||
|  |  | |||
|  | @ -69,6 +69,9 @@ class Warnings(object): | |||
|     W027 = ("Found a large training file of {size} bytes. Note that it may " | ||||
|             "be more efficient to split your training data into multiple " | ||||
|             "smaller JSON files instead.") | ||||
|     W028 = ("Doc.from_array was called with a vector of type '{type}', " | ||||
|             "but is expecting one of type 'uint64' instead. This may result " | ||||
|             "in problems with the vocab further on in the pipeline.") | ||||
|     W030 = ("Some entities could not be aligned in the text \"{text}\" with " | ||||
|             "entities \"{entities}\". Use " | ||||
|             "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" | ||||
|  | @ -477,15 +480,14 @@ class Errors(object): | |||
|     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") | ||||
| 
 | ||||
|     # TODO: fix numbering after merging develop into master | ||||
|     E969 = ("Expected string values for field '{field}', but received {types} instead. ") | ||||
|     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?") | ||||
|     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " | ||||
|             "array and {doc_length} for the Doc itself.") | ||||
|     E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.") | ||||
|     E973 = ("Unexpected type for NER data") | ||||
|     E974 = ("Unknown {obj} attribute: {key}") | ||||
|     E975 = ("The method 'Example.from_dict' expects a Doc as first argument, " | ||||
|             "but got {type}") | ||||
|     E976 = ("The method 'Example.from_dict' expects a dict as second argument, " | ||||
|     E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, " | ||||
|             "but received None.") | ||||
|     E977 = ("Can not compare a MorphAnalysis with a string object. " | ||||
|             "This is likely a bug in spaCy, so feel free to open an issue.") | ||||
|  |  | |||
|  | @ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): | |||
| 
 | ||||
| cdef class Example: | ||||
|     def __init__(self, Doc predicted, Doc reference, *, alignment=None): | ||||
|         """ Doc can either be text, or an actual Doc """ | ||||
|         if predicted is None: | ||||
|             raise TypeError(Errors.E972.format(arg="predicted")) | ||||
|         if reference is None: | ||||
|  | @ -37,6 +36,9 @@ cdef class Example: | |||
|         self.y = reference | ||||
|         self._alignment = alignment | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         return len(self.predicted) | ||||
| 
 | ||||
|     property predicted: | ||||
|         def __get__(self): | ||||
|             return self.x | ||||
|  | @ -59,17 +61,15 @@ cdef class Example: | |||
| 
 | ||||
|     @classmethod | ||||
|     def from_dict(cls, Doc predicted, dict example_dict): | ||||
|         if predicted is None: | ||||
|             raise ValueError(Errors.E976.format(n="first", type="Doc")) | ||||
|         if example_dict is None: | ||||
|             raise ValueError(Errors.E976) | ||||
|         if not isinstance(predicted, Doc): | ||||
|             raise TypeError(Errors.E975.format(type=type(predicted))) | ||||
|             raise ValueError(Errors.E976.format(n="second", type="dict")) | ||||
|         example_dict = _fix_legacy_dict_data(example_dict) | ||||
|         tok_dict, doc_dict = _parse_example_dict_data(example_dict) | ||||
|         if "ORTH" not in tok_dict: | ||||
|             tok_dict["ORTH"] = [tok.text for tok in predicted] | ||||
|             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] | ||||
|         if not _has_field(tok_dict, "SPACY"): | ||||
|             spaces = _guess_spaces(predicted.text, tok_dict["ORTH"]) | ||||
|         return Example( | ||||
|             predicted, | ||||
|             annotations2doc(predicted.vocab, tok_dict, doc_dict) | ||||
|  | @ -257,7 +257,11 @@ def _annot2array(vocab, tok_annot, doc_annot): | |||
|             values.append([vocab.morphology.add(v) for v in value]) | ||||
|         else: | ||||
|             attrs.append(key) | ||||
|             values.append([vocab.strings.add(v) for v in value]) | ||||
|             try: | ||||
|                 values.append([vocab.strings.add(v) for v in value]) | ||||
|             except TypeError: | ||||
|                 types= set([type(v) for v in value]) | ||||
|                 raise TypeError(Errors.E969.format(field=key, types=types)) | ||||
| 
 | ||||
|     array = numpy.asarray(values, dtype="uint64") | ||||
|     return attrs, array.T | ||||
|  | @ -325,8 +329,8 @@ def _fix_legacy_dict_data(example_dict): | |||
|     for key, value in old_token_dict.items(): | ||||
|         if key in ("text", "ids", "brackets"): | ||||
|             pass | ||||
|         elif key in remapping: | ||||
|             token_dict[remapping[key]] = value | ||||
|         elif key.lower() in remapping: | ||||
|             token_dict[remapping[key.lower()]] = value | ||||
|         else: | ||||
|             raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) | ||||
|     text = example_dict.get("text", example_dict.get("raw")) | ||||
|  |  | |||
|  | @ -513,20 +513,23 @@ class Language(object): | |||
|     ): | ||||
|         """Update the models in the pipeline. | ||||
| 
 | ||||
|         examples (iterable): A batch of `Example` objects. | ||||
|         examples (Iterable[Example]): A batch of examples | ||||
|         dummy: Should not be set - serves to catch backwards-incompatible scripts. | ||||
|         drop (float): The dropout rate. | ||||
|         sgd (callable): An optimizer. | ||||
|         losses (dict): Dictionary to update with the loss, keyed by component. | ||||
|         component_cfg (dict): Config parameters for specific pipeline | ||||
|         sgd (Optimizer): An optimizer. | ||||
|         losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. | ||||
|         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline | ||||
|             components, keyed by component name. | ||||
|         RETURNS (Dict[str, float]): The updated losses dictionary | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/language#update | ||||
|         """ | ||||
|         if dummy is not None: | ||||
|             raise ValueError(Errors.E989) | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         if len(examples) == 0: | ||||
|             return | ||||
|             return losses | ||||
|         if not isinstance(examples, Iterable): | ||||
|             raise TypeError(Errors.E978.format(name="language", method="update", types=type(examples))) | ||||
|         wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)]) | ||||
|  | @ -540,22 +543,19 @@ class Language(object): | |||
| 
 | ||||
|         if component_cfg is None: | ||||
|             component_cfg = {} | ||||
|         component_deps = count_pipeline_interdependencies(self.pipeline) | ||||
|         # Determine whether component should set annotations. In theory I guess | ||||
|         # we should do this by inspecting the meta? Or we could just always | ||||
|         # say "yes" | ||||
|         for i, (name, proc) in enumerate(self.pipeline): | ||||
|             component_cfg.setdefault(name, {}) | ||||
|             component_cfg[name].setdefault("drop", drop) | ||||
|             component_cfg[name]["set_annotations"] = bool(component_deps[i]) | ||||
|             component_cfg[name].setdefault("set_annotations", False) | ||||
|         for name, proc in self.pipeline: | ||||
|             if not hasattr(proc, "update"): | ||||
|                 continue | ||||
|             proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) | ||||
|         if sgd is not False: | ||||
|         if sgd not in (None, False): | ||||
|             for name, proc in self.pipeline: | ||||
|                 if hasattr(proc, "model"): | ||||
|                     proc.model.finish_update(sgd) | ||||
|         return losses | ||||
| 
 | ||||
|     def rehearse(self, examples, sgd=None, losses=None, config=None): | ||||
|         """Make a "rehearsal" update to the models in the pipeline, to prevent | ||||
|  | @ -761,18 +761,17 @@ class Language(object): | |||
|     ): | ||||
|         """Process texts as a stream, and yield `Doc` objects in order. | ||||
| 
 | ||||
|         texts (iterator): A sequence of texts to process. | ||||
|         texts (Iterable[str]): A sequence of texts to process. | ||||
|         as_tuples (bool): If set to True, inputs should be a sequence of | ||||
|             (text, context) tuples. Output will then be a sequence of | ||||
|             (doc, context) tuples. Defaults to False. | ||||
|         batch_size (int): The number of texts to buffer. | ||||
|         disable (list): Names of the pipeline components to disable. | ||||
|         disable (List[str]): Names of the pipeline components to disable. | ||||
|         cleanup (bool): If True, unneeded strings are freed to control memory | ||||
|             use. Experimental. | ||||
|         component_cfg (dict): An optional dictionary with extra keyword | ||||
|         component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword | ||||
|             arguments for specific components. | ||||
|         n_process (int): Number of processors to process texts, only supported | ||||
|             in Python3. If -1, set `multiprocessing.cpu_count()`. | ||||
|         n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`. | ||||
|         YIELDS (Doc): Documents in the order of the original text. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/language#pipe | ||||
|  |  | |||
|  | @ -1,13 +1,14 @@ | |||
| from thinc.api import Model, normal_init | ||||
| 
 | ||||
| 
 | ||||
| def PrecomputableAffine(nO, nI, nF, nP): | ||||
| def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): | ||||
|     model = Model( | ||||
|         "precomputable_affine", | ||||
|         forward, | ||||
|         init=init, | ||||
|         dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, | ||||
|         params={"W": None, "b": None, "pad": None}, | ||||
|         attrs={"dropout_rate": dropout} | ||||
|     ) | ||||
|     return model | ||||
| 
 | ||||
|  | @ -48,17 +49,14 @@ def forward(model, X, is_train): | |||
|         model.inc_grad("b", dY.sum(axis=0)) | ||||
|         dY = dY.reshape((dY.shape[0], nO * nP)) | ||||
| 
 | ||||
|         Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3))) | ||||
|         Wopfi = W.transpose((1, 2, 0, 3)) | ||||
|         Wopfi = Wopfi.reshape((nO * nP, nF * nI)) | ||||
|         dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) | ||||
| 
 | ||||
|         # Reuse the buffer | ||||
|         dWopfi = Wopfi | ||||
|         dWopfi.fill(0.0) | ||||
|         model.ops.gemm(dY, Xf, out=dWopfi, trans1=True) | ||||
|         dWopfi = model.ops.gemm(dY, Xf, trans1=True) | ||||
|         dWopfi = dWopfi.reshape((nO, nP, nF, nI)) | ||||
|         # (o, p, f, i) --> (f, o, p, i) | ||||
|         dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3))) | ||||
|         dWopfi = dWopfi.transpose((2, 0, 1, 3)) | ||||
|         model.inc_grad("W", dWopfi) | ||||
|         return dXf.reshape((dXf.shape[0], nF, nI)) | ||||
| 
 | ||||
|  |  | |||
|  | @ -87,16 +87,16 @@ def build_text_classifier( | |||
|     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] | ||||
|     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): | ||||
|         lower = HashEmbed( | ||||
|             nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout | ||||
|             nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10 | ||||
|         ) | ||||
|         prefix = HashEmbed( | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11 | ||||
|         ) | ||||
|         suffix = HashEmbed( | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12 | ||||
|         ) | ||||
|         shape = HashEmbed( | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13 | ||||
|         ) | ||||
| 
 | ||||
|         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) | ||||
|  |  | |||
|  | @ -154,16 +154,16 @@ def LayerNormalizedMaxout(width, maxout_pieces): | |||
| def MultiHashEmbed( | ||||
|     columns, width, rows, use_subwords, pretrained_vectors, mix, dropout | ||||
| ): | ||||
|     norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) | ||||
|     norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6) | ||||
|     if use_subwords: | ||||
|         prefix = HashEmbed( | ||||
|             nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout | ||||
|             nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout, seed=7 | ||||
|         ) | ||||
|         suffix = HashEmbed( | ||||
|             nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout | ||||
|             nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout, seed=8 | ||||
|         ) | ||||
|         shape = HashEmbed( | ||||
|             nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout | ||||
|             nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout, seed=9 | ||||
|         ) | ||||
| 
 | ||||
|     if pretrained_vectors: | ||||
|  | @ -192,7 +192,7 @@ def MultiHashEmbed( | |||
| 
 | ||||
| @registry.architectures.register("spacy.CharacterEmbed.v1") | ||||
| def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): | ||||
|     norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) | ||||
|     norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5) | ||||
|     chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) | ||||
|     with Model.define_operators({">>": chain, "|": concatenate}): | ||||
|         embed_layer = chr_embed | features >> with_array(norm) | ||||
|  | @ -263,20 +263,20 @@ def build_Tok2Vec_model( | |||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||
|     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): | ||||
|         norm = HashEmbed( | ||||
|             nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout, | ||||
|             nO=width, nV=embed_size, column=cols.index(NORM), dropout=None, | ||||
|             seed=0 | ||||
|         ) | ||||
|         if subword_features: | ||||
|             prefix = HashEmbed( | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout, | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None, | ||||
|                 seed=1 | ||||
|             ) | ||||
|             suffix = HashEmbed( | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout, | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None, | ||||
|                 seed=2 | ||||
|             ) | ||||
|             shape = HashEmbed( | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout, | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None, | ||||
|                 seed=3 | ||||
|             ) | ||||
|         else: | ||||
|  | @ -296,7 +296,7 @@ def build_Tok2Vec_model( | |||
|                     >> Maxout( | ||||
|                         nO=width, | ||||
|                         nI=width * columns, | ||||
|                         nP=maxout_pieces, | ||||
|                         nP=3, | ||||
|                         dropout=0.0, | ||||
|                         normalize=True, | ||||
|                     ), | ||||
|  | @ -309,7 +309,7 @@ def build_Tok2Vec_model( | |||
|                     >> Maxout( | ||||
|                         nO=width, | ||||
|                         nI=width * columns, | ||||
|                         nP=maxout_pieces, | ||||
|                         nP=3, | ||||
|                         dropout=0.0, | ||||
|                         normalize=True, | ||||
|                     ), | ||||
|  | @ -322,7 +322,7 @@ def build_Tok2Vec_model( | |||
|                 >> Maxout( | ||||
|                     nO=width, | ||||
|                     nI=width * columns, | ||||
|                     nP=maxout_pieces, | ||||
|                     nP=3, | ||||
|                     dropout=0.0, | ||||
|                     normalize=True, | ||||
|                 ), | ||||
|  | @ -335,7 +335,7 @@ def build_Tok2Vec_model( | |||
|             reduce_dimensions = Maxout( | ||||
|                 nO=width, | ||||
|                 nI=nM * nC + width, | ||||
|                 nP=maxout_pieces, | ||||
|                 nP=3, | ||||
|                 dropout=0.0, | ||||
|                 normalize=True, | ||||
|             ) | ||||
|  |  | |||
|  | @ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear | |||
| from ..syntax._parser_model import ParserStepModel | ||||
| 
 | ||||
| 
 | ||||
| def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): | ||||
| def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()): | ||||
|     """Set up a stepwise transition-based model""" | ||||
|     if upper is None: | ||||
|         has_upper = False | ||||
|  |  | |||
|  | @ -272,7 +272,7 @@ cdef class Morphology: | |||
| 
 | ||||
|     @staticmethod | ||||
|     def feats_to_dict(feats): | ||||
|         if not feats: | ||||
|         if not feats or feats == Morphology.EMPTY_MORPH: | ||||
|             return {} | ||||
|         return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in | ||||
|                 [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} | ||||
|  |  | |||
|  | @ -3,7 +3,7 @@ cimport numpy as np | |||
| 
 | ||||
| import numpy | ||||
| import srsly | ||||
| from thinc.api import to_categorical | ||||
| from thinc.api import SequenceCategoricalCrossentropy | ||||
| 
 | ||||
| from ..tokens.doc cimport Doc | ||||
| from ..vocab cimport Vocab | ||||
|  | @ -85,13 +85,10 @@ class Morphologizer(Tagger): | |||
|             doc.is_morphed = True | ||||
| 
 | ||||
|     def get_loss(self, examples, scores): | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|         tag_index = {tag: i for i, tag in enumerate(self.labels)} | ||||
|         cdef int idx = 0 | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype="i") | ||||
|         guesses = scores.argmax(axis=1) | ||||
|         known_labels = numpy.ones((scores.shape[0], 1), dtype="f") | ||||
|         loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) | ||||
|         truths = [] | ||||
|         for eg in examples: | ||||
|             eg_truths = [] | ||||
|             pos_tags = eg.get_aligned("POS", as_string=True) | ||||
|             morphs = eg.get_aligned("MORPH", as_string=True) | ||||
|             for i in range(len(morphs)): | ||||
|  | @ -104,20 +101,11 @@ class Morphologizer(Tagger): | |||
|                     morph = self.vocab.strings[self.vocab.morphology.add(feats)] | ||||
|                 if morph == "": | ||||
|                     morph = Morphology.EMPTY_MORPH | ||||
|                 if morph is None: | ||||
|                     correct[idx] = guesses[idx] | ||||
|                 elif morph in tag_index: | ||||
|                     correct[idx] = tag_index[morph] | ||||
|                 else: | ||||
|                     correct[idx] = 0 | ||||
|                     known_labels[idx] = 0. | ||||
|                 idx += 1 | ||||
|         correct = self.model.ops.xp.array(correct, dtype="i") | ||||
|         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) | ||||
|         d_scores *= self.model.ops.asarray(known_labels) | ||||
|         loss = (d_scores**2).sum() | ||||
|         docs = [eg.predicted for eg in examples] | ||||
|         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) | ||||
|                 eg_truths.append(morph) | ||||
|             truths.append(eg_truths) | ||||
|         d_scores, loss = loss_func(scores, truths) | ||||
|         if self.model.ops.xp.isnan(loss): | ||||
|             raise ValueError("nan value when computing loss") | ||||
|         return float(loss), d_scores | ||||
| 
 | ||||
|     def to_bytes(self, exclude=tuple()): | ||||
|  |  | |||
|  | @ -58,12 +58,8 @@ class Pipe(object): | |||
|         Both __call__ and pipe should delegate to the `predict()` | ||||
|         and `set_annotations()` methods. | ||||
|         """ | ||||
|         predictions = self.predict([doc]) | ||||
|         if isinstance(predictions, tuple) and len(predictions) == 2: | ||||
|             scores, tensors = predictions | ||||
|             self.set_annotations([doc], scores, tensors=tensors) | ||||
|         else: | ||||
|             self.set_annotations([doc], predictions) | ||||
|         scores = self.predict([doc]) | ||||
|         self.set_annotations([doc], scores) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream, batch_size=128): | ||||
|  | @ -73,12 +69,8 @@ class Pipe(object): | |||
|         and `set_annotations()` methods. | ||||
|         """ | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             predictions = self.predict(docs) | ||||
|             if isinstance(predictions, tuple) and len(tuple) == 2: | ||||
|                 scores, tensors = predictions | ||||
|                 self.set_annotations(docs, scores, tensors=tensors) | ||||
|             else: | ||||
|                 self.set_annotations(docs, predictions) | ||||
|             scores = self.predict(docs) | ||||
|             self.set_annotations(docs, scores) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|  | @ -87,7 +79,7 @@ class Pipe(object): | |||
|         """ | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def set_annotations(self, docs, scores, tensors=None): | ||||
|     def set_annotations(self, docs, scores): | ||||
|         """Modify a batch of documents, using pre-computed scores.""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|  | @ -281,9 +273,10 @@ class Tagger(Pipe): | |||
|                 idx += 1 | ||||
|             doc.is_tagged = True | ||||
| 
 | ||||
|     def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): | ||||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|     def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False): | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         losses.setdefault(self.name, 0.0) | ||||
| 
 | ||||
|         try: | ||||
|             if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): | ||||
|  | @ -303,11 +296,11 @@ class Tagger(Pipe): | |||
|         if sgd not in (None, False): | ||||
|             self.model.finish_update(sgd) | ||||
| 
 | ||||
|         if losses is not None: | ||||
|             losses[self.name] += loss | ||||
|         losses[self.name] += loss | ||||
|         if set_annotations: | ||||
|             docs = [eg.predicted for eg in examples] | ||||
|             self.set_annotations(docs, self._scores2guesses(tag_scores)) | ||||
|         return losses | ||||
| 
 | ||||
|     def rehearse(self, examples, drop=0., sgd=None, losses=None): | ||||
|         """Perform a 'rehearsal' update, where we try to match the output of | ||||
|  | @ -334,7 +327,7 @@ class Tagger(Pipe): | |||
|             losses[self.name] += (gradient**2).sum() | ||||
| 
 | ||||
|     def get_loss(self, examples, scores): | ||||
|         loss_func = SequenceCategoricalCrossentropy(names=self.labels) | ||||
|         loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) | ||||
|         truths = [eg.get_aligned("tag", as_string=True) for eg in examples] | ||||
|         d_scores, loss = loss_func(scores, truths) | ||||
|         if self.model.ops.xp.isnan(loss): | ||||
|  | @ -521,29 +514,23 @@ class SentenceRecognizer(Tagger): | |||
|                         doc.c[j].sent_start = -1 | ||||
| 
 | ||||
|     def get_loss(self, examples, scores): | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|         tag_index = range(len(self.labels)) | ||||
|         cdef int idx = 0 | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype="i") | ||||
|         guesses = scores.argmax(axis=1) | ||||
|         known_labels = numpy.ones((scores.shape[0], 1), dtype="f") | ||||
|         labels = self.labels | ||||
|         loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) | ||||
|         truths = [] | ||||
|         for eg in examples: | ||||
|             sent_starts = eg.get_aligned("sent_start") | ||||
|             for sent_start in sent_starts: | ||||
|                 if sent_start is None: | ||||
|                     correct[idx] = guesses[idx] | ||||
|                 elif sent_start in tag_index: | ||||
|                     correct[idx] = sent_start | ||||
|             eg_truth = [] | ||||
|             for x in eg.get_aligned("sent_start"): | ||||
|                 if x == None: | ||||
|                     eg_truth.append(None) | ||||
|                 elif x == 1: | ||||
|                     eg_truth.append(labels[1]) | ||||
|                 else: | ||||
|                     correct[idx] = 0 | ||||
|                     known_labels[idx] = 0. | ||||
|                 idx += 1 | ||||
|         correct = self.model.ops.xp.array(correct, dtype="i") | ||||
|         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) | ||||
|         d_scores *= self.model.ops.asarray(known_labels) | ||||
|         loss = (d_scores**2).sum() | ||||
|         docs = [eg.predicted for eg in examples] | ||||
|         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) | ||||
|                     # anything other than 1 (0, -1, or -1 as uint64) maps to labels[0] | ||||
|                     eg_truth.append(labels[0]) | ||||
|             truths.append(eg_truth) | ||||
|         d_scores, loss = loss_func(scores, truths) | ||||
|         if self.model.ops.xp.isnan(loss): | ||||
|             raise ValueError("nan value when computing loss") | ||||
|         return float(loss), d_scores | ||||
| 
 | ||||
|     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, | ||||
|  | @ -641,7 +628,7 @@ class MultitaskObjective(Tagger): | |||
|     def labels(self, value): | ||||
|         self.cfg["labels"] = value | ||||
| 
 | ||||
|     def set_annotations(self, docs, dep_ids, tensors=None): | ||||
|     def set_annotations(self, docs, dep_ids): | ||||
|         pass | ||||
| 
 | ||||
|     def begin_training(self, get_examples=lambda: [], pipeline=None, | ||||
|  | @ -738,7 +725,7 @@ class ClozeMultitask(Pipe): | |||
|         self.cfg = cfg | ||||
|         self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config | ||||
| 
 | ||||
|     def set_annotations(self, docs, dep_ids, tensors=None): | ||||
|     def set_annotations(self, docs, dep_ids): | ||||
|         pass | ||||
| 
 | ||||
|     def begin_training(self, get_examples=lambda: [], pipeline=None, | ||||
|  | @ -767,7 +754,7 @@ class ClozeMultitask(Pipe): | |||
|         loss = self.distance.get_loss(prediction, target) | ||||
|         return loss, gradient | ||||
| 
 | ||||
|     def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): | ||||
|     def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): | ||||
|         pass | ||||
| 
 | ||||
|     def rehearse(self, examples, drop=0., sgd=None, losses=None): | ||||
|  | @ -815,8 +802,8 @@ class TextCategorizer(Pipe): | |||
| 
 | ||||
|     def pipe(self, stream, batch_size=128): | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             scores, tensors = self.predict(docs) | ||||
|             self.set_annotations(docs, scores, tensors=tensors) | ||||
|             scores = self.predict(docs) | ||||
|             self.set_annotations(docs, scores) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|  | @ -826,22 +813,25 @@ class TextCategorizer(Pipe): | |||
|             # Handle cases where there are no tokens in any docs. | ||||
|             xp = get_array_module(tensors) | ||||
|             scores = xp.zeros((len(docs), len(self.labels))) | ||||
|             return scores, tensors | ||||
|             return scores | ||||
| 
 | ||||
|         scores = self.model.predict(docs) | ||||
|         scores = self.model.ops.asarray(scores) | ||||
|         return scores, tensors | ||||
|         return scores | ||||
| 
 | ||||
|     def set_annotations(self, docs, scores, tensors=None): | ||||
|     def set_annotations(self, docs, scores): | ||||
|         for i, doc in enumerate(docs): | ||||
|             for j, label in enumerate(self.labels): | ||||
|                 doc.cats[label] = float(scores[i, j]) | ||||
| 
 | ||||
|     def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): | ||||
|     def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         losses.setdefault(self.name, 0.0) | ||||
|         try: | ||||
|             if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): | ||||
|                 # Handle cases where there are no tokens in any docs. | ||||
|                 return | ||||
|                 return losses | ||||
|         except AttributeError: | ||||
|             types = set([type(eg) for eg in examples]) | ||||
|             raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types)) | ||||
|  | @ -853,12 +843,11 @@ class TextCategorizer(Pipe): | |||
|         bp_scores(d_scores) | ||||
|         if sgd is not None: | ||||
|             self.model.finish_update(sgd) | ||||
|         if losses is not None: | ||||
|             losses.setdefault(self.name, 0.0) | ||||
|             losses[self.name] += loss | ||||
|         losses[self.name] += loss | ||||
|         if set_annotations: | ||||
|             docs = [eg.predicted for eg in examples] | ||||
|             self.set_annotations(docs, scores=scores) | ||||
|         return losses | ||||
| 
 | ||||
|     def rehearse(self, examples, drop=0., sgd=None, losses=None): | ||||
|         if self._rehearsal_model is None: | ||||
|  | @ -1082,12 +1071,13 @@ class EntityLinker(Pipe): | |||
|             sgd = self.create_optimizer() | ||||
|         return sgd | ||||
| 
 | ||||
|     def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None): | ||||
|     def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None): | ||||
|         self.require_kb() | ||||
|         if losses is not None: | ||||
|             losses.setdefault(self.name, 0.0) | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         losses.setdefault(self.name, 0.0) | ||||
|         if not examples: | ||||
|             return 0 | ||||
|             return losses | ||||
|         sentence_docs = [] | ||||
|         try: | ||||
|             docs = [eg.predicted for eg in examples] | ||||
|  | @ -1130,20 +1120,19 @@ class EntityLinker(Pipe): | |||
|             return 0.0 | ||||
|         sentence_encodings, bp_context = self.model.begin_update(sentence_docs) | ||||
|         loss, d_scores = self.get_similarity_loss( | ||||
|             scores=sentence_encodings, | ||||
|             sentence_encodings=sentence_encodings, | ||||
|             examples=examples | ||||
|         ) | ||||
|         bp_context(d_scores) | ||||
|         if sgd is not None: | ||||
|             self.model.finish_update(sgd) | ||||
| 
 | ||||
|         if losses is not None: | ||||
|             losses[self.name] += loss | ||||
|         losses[self.name] += loss | ||||
|         if set_annotations: | ||||
|             self.set_annotations(docs, predictions) | ||||
|         return loss | ||||
|         return losses | ||||
| 
 | ||||
|     def get_similarity_loss(self, examples, scores): | ||||
|     def get_similarity_loss(self, examples, sentence_encodings): | ||||
|         entity_encodings = [] | ||||
|         for eg in examples: | ||||
|             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) | ||||
|  | @ -1155,41 +1144,23 @@ class EntityLinker(Pipe): | |||
| 
 | ||||
|         entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") | ||||
| 
 | ||||
|         if scores.shape != entity_encodings.shape: | ||||
|         if sentence_encodings.shape != entity_encodings.shape: | ||||
|             raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up")) | ||||
| 
 | ||||
|         gradients = self.distance.get_grad(scores, entity_encodings) | ||||
|         loss = self.distance.get_loss(scores, entity_encodings) | ||||
|         gradients = self.distance.get_grad(sentence_encodings, entity_encodings) | ||||
|         loss = self.distance.get_loss(sentence_encodings, entity_encodings) | ||||
|         loss = loss / len(entity_encodings) | ||||
|         return loss, gradients | ||||
| 
 | ||||
|     def get_loss(self, examples, scores): | ||||
|         cats = [] | ||||
|         for eg in examples: | ||||
|             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) | ||||
|             for ent in eg.predicted.ents: | ||||
|                 kb_id = kb_ids[ent.start] | ||||
|                 if kb_id: | ||||
|                     cats.append([1.0]) | ||||
| 
 | ||||
|         cats = self.model.ops.asarray(cats, dtype="float32") | ||||
|         if len(scores) != len(cats): | ||||
|             raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up")) | ||||
| 
 | ||||
|         d_scores = (scores - cats) | ||||
|         loss = (d_scores ** 2).sum() | ||||
|         loss = loss / len(cats) | ||||
|         return loss, d_scores | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         kb_ids, tensors = self.predict([doc]) | ||||
|         self.set_annotations([doc], kb_ids, tensors=tensors) | ||||
|         kb_ids = self.predict([doc]) | ||||
|         self.set_annotations([doc], kb_ids) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream, batch_size=128): | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             kb_ids, tensors = self.predict(docs) | ||||
|             self.set_annotations(docs, kb_ids, tensors=tensors) | ||||
|             kb_ids = self.predict(docs) | ||||
|             self.set_annotations(docs, kb_ids) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|  | @ -1197,10 +1168,9 @@ class EntityLinker(Pipe): | |||
|         self.require_kb() | ||||
|         entity_count = 0 | ||||
|         final_kb_ids = [] | ||||
|         final_tensors = [] | ||||
| 
 | ||||
|         if not docs: | ||||
|             return final_kb_ids, final_tensors | ||||
|             return final_kb_ids | ||||
| 
 | ||||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|  | @ -1234,21 +1204,18 @@ class EntityLinker(Pipe): | |||
|                             if to_discard and ent.label_ in to_discard: | ||||
|                                 # ignoring this entity - setting to NIL | ||||
|                                 final_kb_ids.append(self.NIL) | ||||
|                                 final_tensors.append(sentence_encoding) | ||||
| 
 | ||||
|                             else: | ||||
|                                 candidates = self.kb.get_candidates(ent.text) | ||||
|                                 if not candidates: | ||||
|                                     # no prediction possible for this entity - setting to NIL | ||||
|                                     final_kb_ids.append(self.NIL) | ||||
|                                     final_tensors.append(sentence_encoding) | ||||
| 
 | ||||
|                                 elif len(candidates) == 1: | ||||
|                                     # shortcut for efficiency reasons: take the 1 candidate | ||||
| 
 | ||||
|                                     # TODO: thresholding | ||||
|                                     final_kb_ids.append(candidates[0].entity_) | ||||
|                                     final_tensors.append(sentence_encoding) | ||||
| 
 | ||||
|                                 else: | ||||
|                                     random.shuffle(candidates) | ||||
|  | @ -1277,14 +1244,13 @@ class EntityLinker(Pipe): | |||
|                                     best_index = scores.argmax().item() | ||||
|                                     best_candidate = candidates[best_index] | ||||
|                                     final_kb_ids.append(best_candidate.entity_) | ||||
|                                     final_tensors.append(sentence_encoding) | ||||
| 
 | ||||
|         if not (len(final_tensors) == len(final_kb_ids) == entity_count): | ||||
|         if not (len(final_kb_ids) == entity_count): | ||||
|             raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) | ||||
| 
 | ||||
|         return final_kb_ids, final_tensors | ||||
|         return final_kb_ids | ||||
| 
 | ||||
|     def set_annotations(self, docs, kb_ids, tensors=None): | ||||
|     def set_annotations(self, docs, kb_ids): | ||||
|         count_ents = len([ent for doc in docs for ent in doc.ents]) | ||||
|         if count_ents != len(kb_ids): | ||||
|             raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) | ||||
|  | @ -1400,11 +1366,7 @@ class Sentencizer(Pipe): | |||
|     def pipe(self, stream, batch_size=128): | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             predictions = self.predict(docs) | ||||
|             if isinstance(predictions, tuple) and len(tuple) == 2: | ||||
|                 scores, tensors = predictions | ||||
|                 self.set_annotations(docs, scores, tensors=tensors) | ||||
|             else: | ||||
|                 self.set_annotations(docs, predictions) | ||||
|             self.set_annotations(docs, predictions) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|  | @ -1435,7 +1397,7 @@ class Sentencizer(Pipe): | |||
|             guesses.append(doc_guesses) | ||||
|         return guesses | ||||
| 
 | ||||
|     def set_annotations(self, docs, batch_tag_ids, tensors=None): | ||||
|     def set_annotations(self, docs, batch_tag_ids): | ||||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|         cdef Doc doc | ||||
|  |  | |||
|  | @ -57,7 +57,7 @@ class SimpleNER(Pipe): | |||
|         scores = self.model.predict(docs) | ||||
|         return scores | ||||
| 
 | ||||
|     def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None): | ||||
|     def set_annotations(self, docs: List[Doc], scores: List[Floats2d]): | ||||
|         """Set entities on a batch of documents from a batch of scores.""" | ||||
|         tag_names = self.get_tag_names() | ||||
|         for i, doc in enumerate(docs): | ||||
|  | @ -67,9 +67,12 @@ class SimpleNER(Pipe): | |||
|                 tags = iob_to_biluo(tags) | ||||
|             doc.ents = spans_from_biluo_tags(doc, tags) | ||||
| 
 | ||||
|     def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): | ||||
|     def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None): | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         losses.setdefault("ner", 0.0) | ||||
|         if not any(_has_ner(eg) for eg in examples): | ||||
|             return 0 | ||||
|             return losses | ||||
|         docs = [eg.predicted for eg in examples] | ||||
|         set_dropout_rate(self.model, drop) | ||||
|         scores, bp_scores = self.model.begin_update(docs) | ||||
|  | @ -79,10 +82,8 @@ class SimpleNER(Pipe): | |||
|             self.set_annotations(docs, scores) | ||||
|         if sgd is not None: | ||||
|             self.model.finish_update(sgd) | ||||
|         if losses is not None: | ||||
|             losses.setdefault("ner", 0.0) | ||||
|             losses["ner"] += loss | ||||
|         return loss | ||||
|         losses["ner"] += loss | ||||
|         return losses | ||||
| 
 | ||||
|     def get_loss(self, examples, scores): | ||||
|         loss = 0 | ||||
|  |  | |||
|  | @ -83,12 +83,14 @@ class Tok2Vec(Pipe): | |||
|             assert tokvecs.shape[0] == len(doc) | ||||
|             doc.tensor = tokvecs | ||||
| 
 | ||||
|     def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False): | ||||
|     def update(self, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False): | ||||
|         """Update the model. | ||||
|         examples (iterable): A batch of examples | ||||
|         examples (Iterable[Example]): A batch of examples | ||||
|         drop (float): The dropout rate. | ||||
|         sgd (callable): An optimizer. | ||||
|         RETURNS (dict): Results from the update. | ||||
|         sgd (Optimizer): An optimizer. | ||||
|         losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. | ||||
|         set_annotations (bool): whether or not to update the examples with the predictions | ||||
|         RETURNS (Dict[str, float]): The updated losses dictionary | ||||
|         """ | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|  | @ -124,6 +126,7 @@ class Tok2Vec(Pipe): | |||
|         self.listeners[-1].receive(batch_id, tokvecs, backprop) | ||||
|         if set_annotations: | ||||
|             self.set_annotations(docs, tokvecs) | ||||
|         return losses | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         pass | ||||
|  |  | |||
|  | @ -222,7 +222,7 @@ class TrainingSchema(BaseModel): | |||
| class ProjectConfigAsset(BaseModel): | ||||
|     # fmt: off | ||||
|     dest: StrictStr = Field(..., title="Destination of downloaded asset") | ||||
|     url: StrictStr = Field(..., title="URL of asset") | ||||
|     url: Optional[StrictStr] = Field(None, title="URL of asset") | ||||
|     checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") | ||||
|     # fmt: on | ||||
| 
 | ||||
|  | @ -232,9 +232,10 @@ class ProjectConfigCommand(BaseModel): | |||
|     name: StrictStr = Field(..., title="Name of command") | ||||
|     help: Optional[StrictStr] = Field(None, title="Command description") | ||||
|     script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") | ||||
|     deps: List[StrictStr] = Field([], title="Data Version Control dependencies") | ||||
|     outputs: List[StrictStr] = Field([], title="Data Version Control outputs") | ||||
|     outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") | ||||
|     deps: List[StrictStr] = Field([], title="File dependencies required by this command") | ||||
|     outputs: List[StrictStr] = Field([], title="Outputs produced by this command") | ||||
|     outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)") | ||||
|     no_skip: bool = Field(False, title="Never skip this command, even if nothing changed") | ||||
|     # fmt: on | ||||
| 
 | ||||
|     class Config: | ||||
|  | @ -246,7 +247,7 @@ class ProjectConfigSchema(BaseModel): | |||
|     # fmt: off | ||||
|     variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands") | ||||
|     assets: List[ProjectConfigAsset] = Field([], title="Data assets") | ||||
|     run: List[StrictStr] = Field([], title="Names of project commands to execute, in order") | ||||
|     workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") | ||||
|     commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") | ||||
|     # fmt: on | ||||
| 
 | ||||
|  |  | |||
|  | @ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no | |||
| 
 | ||||
| 
 | ||||
| class ParserStepModel(Model): | ||||
|     def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True): | ||||
|     def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True, | ||||
|             dropout=0.1): | ||||
|         Model.__init__(self, name="parser_step_model", forward=step_forward) | ||||
|         self.attrs["has_upper"] = has_upper | ||||
|         self.attrs["dropout_rate"] = dropout | ||||
|         self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train) | ||||
|         if layers[1].get_dim("nP") >= 2: | ||||
|             activation = "maxout" | ||||
|  | @ -243,6 +245,13 @@ class ParserStepModel(Model): | |||
|             for class_ in unseen_classes: | ||||
|                 self._class_mask[class_] = 0. | ||||
| 
 | ||||
|     def clear_memory(self): | ||||
|         del self.tokvecs | ||||
|         del self.bp_tokvecs | ||||
|         del self.state2vec | ||||
|         del self.backprops | ||||
|         del self._class_mask | ||||
| 
 | ||||
|     @property | ||||
|     def nO(self): | ||||
|         if self.attrs["has_upper"]: | ||||
|  | @ -271,6 +280,19 @@ class ParserStepModel(Model): | |||
|             c_ids += ids.shape[1] | ||||
|         return ids | ||||
| 
 | ||||
|     def backprop_step(self, token_ids, d_vector, get_d_tokvecs): | ||||
|         if isinstance(self.state2vec.ops, CupyOps) \ | ||||
|         and not isinstance(token_ids, self.state2vec.ops.xp.ndarray): | ||||
|             # Move token_ids and d_vector to GPU, asynchronously | ||||
|             self.backprops.append(( | ||||
|                 util.get_async(self.cuda_stream, token_ids), | ||||
|                 util.get_async(self.cuda_stream, d_vector), | ||||
|                 get_d_tokvecs | ||||
|             )) | ||||
|         else: | ||||
|             self.backprops.append((token_ids, d_vector, get_d_tokvecs)) | ||||
| 
 | ||||
| 
 | ||||
|     def finish_steps(self, golds): | ||||
|         # Add a padding vector to the d_tokvecs gradient, so that missing | ||||
|         # values don't affect the real gradient. | ||||
|  | @ -289,11 +311,17 @@ class ParserStepModel(Model): | |||
|         self.bp_tokvecs(d_tokvecs[:-1]) | ||||
|         return d_tokvecs | ||||
| 
 | ||||
| NUMPY_OPS = NumpyOps() | ||||
| 
 | ||||
| def step_forward(model: ParserStepModel, states, is_train): | ||||
|     token_ids = model.get_token_ids(states) | ||||
|     vector, get_d_tokvecs = model.state2vec(token_ids, is_train) | ||||
|     mask = None | ||||
|     if model.attrs["has_upper"]: | ||||
|         dropout_rate = model.attrs["dropout_rate"] | ||||
|         if is_train and dropout_rate > 0: | ||||
|             mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1) | ||||
|             vector *= mask | ||||
|         scores, get_d_vector = model.vec2scores(vector, is_train) | ||||
|     else: | ||||
|         scores = NumpyOps().asarray(vector) | ||||
|  | @ -305,16 +333,9 @@ def step_forward(model: ParserStepModel, states, is_train): | |||
|         # Zero vectors for unseen classes | ||||
|         d_scores *= model._class_mask | ||||
|         d_vector = get_d_vector(d_scores) | ||||
|         if isinstance(model.state2vec.ops, CupyOps) \ | ||||
|         and not isinstance(token_ids, model.state2vec.ops.xp.ndarray): | ||||
|             # Move token_ids and d_vector to GPU, asynchronously | ||||
|             model.backprops.append(( | ||||
|                 util.get_async(model.cuda_stream, token_ids), | ||||
|                 util.get_async(model.cuda_stream, d_vector), | ||||
|                 get_d_tokvecs | ||||
|             )) | ||||
|         else: | ||||
|             model.backprops.append((token_ids, d_vector, get_d_tokvecs)) | ||||
|         if mask is not None: | ||||
|             d_vector *= mask | ||||
|         model.backprop_step(token_ids, d_vector, get_d_tokvecs) | ||||
|         return None | ||||
|     return scores, backprop_parser_step | ||||
| 
 | ||||
|  | @ -437,7 +458,7 @@ cdef class precompute_hiddens: | |||
|         sum_state_features(<float*>state_vector.data, | ||||
|             feat_weights, &ids[0,0], | ||||
|             token_ids.shape[0], self.nF, self.nO*self.nP) | ||||
|         state_vector = state_vector + self.bias | ||||
|         state_vector += self.bias | ||||
|         state_vector, bp_nonlinearity = self._nonlinearity(state_vector) | ||||
| 
 | ||||
|         def backward(d_state_vector_ids): | ||||
|  |  | |||
|  | @ -65,7 +65,6 @@ cdef class Parser: | |||
|             self.set_output(self.moves.n_moves) | ||||
|         self.cfg = dict(cfg) | ||||
|         self.cfg.setdefault("update_with_oracle_cut_size", 100) | ||||
|         self.cfg.setdefault("normalize_gradients_with_batch_size", True) | ||||
|         self._multitasks = [] | ||||
|         for multitask in cfg.get("multitasks", []): | ||||
|             self.add_multitask_objective(multitask) | ||||
|  | @ -154,7 +153,7 @@ cdef class Parser: | |||
|         doc (Doc): The document to be processed. | ||||
|         """ | ||||
|         states = self.predict([doc]) | ||||
|         self.set_annotations([doc], states, tensors=None) | ||||
|         self.set_annotations([doc], states) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, docs, int batch_size=256): | ||||
|  | @ -171,7 +170,7 @@ cdef class Parser: | |||
|             for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): | ||||
|                 subbatch = list(subbatch) | ||||
|                 parse_states = self.predict(subbatch) | ||||
|                 self.set_annotations(subbatch, parse_states, tensors=None) | ||||
|                 self.set_annotations(subbatch, parse_states) | ||||
|             yield from batch_in_order | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|  | @ -201,6 +200,8 @@ cdef class Parser: | |||
|         with nogil: | ||||
|             self._parseC(&states[0], | ||||
|                 weights, sizes) | ||||
|         model.clear_memory() | ||||
|         del model | ||||
|         return batch | ||||
| 
 | ||||
|     cdef void _parseC(self, StateC** states, | ||||
|  | @ -223,7 +224,7 @@ cdef class Parser: | |||
|             unfinished.clear() | ||||
|         free_activations(&activations) | ||||
| 
 | ||||
|     def set_annotations(self, docs, states, tensors=None): | ||||
|     def set_annotations(self, docs, states): | ||||
|         cdef StateClass state | ||||
|         cdef Doc doc | ||||
|         for i, (state, doc) in enumerate(zip(states, docs)): | ||||
|  | @ -264,7 +265,7 @@ cdef class Parser: | |||
|                 states[i].push_hist(guess) | ||||
|         free(is_valid) | ||||
| 
 | ||||
|     def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): | ||||
|     def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): | ||||
|         cdef StateClass state | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|  | @ -280,11 +281,12 @@ cdef class Parser: | |||
|             [eg.predicted for eg in examples]) | ||||
|         if self.cfg["update_with_oracle_cut_size"] >= 1: | ||||
|             # Chop sequences into lengths of this many transitions, to make the | ||||
|             # batch uniform length. We randomize this to overfit less. | ||||
|             # batch uniform length. | ||||
|             # We used to randomize this, but it's not clear that actually helps? | ||||
|             cut_size = self.cfg["update_with_oracle_cut_size"] | ||||
|             states, golds, max_steps = self._init_gold_batch( | ||||
|                 examples, | ||||
|                 max_length=numpy.random.choice(range(5, cut_size)) | ||||
|                 max_length=cut_size  | ||||
|             ) | ||||
|         else: | ||||
|             states, golds, _ = self.moves.init_gold_batch(examples) | ||||
|  | @ -292,24 +294,15 @@ cdef class Parser: | |||
|         if not states: | ||||
|             return losses | ||||
|         all_states = list(states) | ||||
|         states_golds = zip(states, golds) | ||||
|         for _ in range(max_steps): | ||||
|             if not states_golds: | ||||
|                 break | ||||
|         states_golds = list(zip(states, golds)) | ||||
|         while states_golds: | ||||
|             states, golds = zip(*states_golds) | ||||
|             scores, backprop = model.begin_update(states) | ||||
|             d_scores = self.get_batch_loss(states, golds, scores, losses) | ||||
|             if self.cfg["normalize_gradients_with_batch_size"]: | ||||
|                 # We have to be very careful how we do this, because of the way we | ||||
|                 # cut up the batch. We subdivide long sequences. If we normalize | ||||
|                 # naively, we end up normalizing by sequence length, which | ||||
|                 # is bad: that would mean that states in long sequences | ||||
|                 # consistently get smaller gradients. Imagine if we have two | ||||
|                 # sequences, one length 1000, one length 20. If we cut up | ||||
|                 # the 1k sequence so that we have a "batch" of 50 subsequences, | ||||
|                 # we don't want the gradients to get 50 times smaller! | ||||
|                 d_scores /= n_examples | ||||
| 
 | ||||
|             # Note that the gradient isn't normalized by the batch size | ||||
|             # here, because our "samples" are really the states...But we | ||||
|             # can't normalize by the number of states either, as then we'd | ||||
|             # be getting smaller gradients for states in long sequences. | ||||
|             backprop(d_scores) | ||||
|             # Follow the predicted action | ||||
|             self.transition_states(states, scores) | ||||
|  | @ -321,6 +314,13 @@ cdef class Parser: | |||
|         if set_annotations: | ||||
|             docs = [eg.predicted for eg in examples] | ||||
|             self.set_annotations(docs, all_states) | ||||
|         # Ugh, this is annoying. If we're working on GPU, we want to free the | ||||
|         # memory ASAP. It seems that Python doesn't necessarily get around to | ||||
|         # removing these in time if we don't explicitly delete? It's confusing. | ||||
|         del backprop | ||||
|         del backprop_tok2vec | ||||
|         model.clear_memory() | ||||
|         del model | ||||
|         return losses | ||||
| 
 | ||||
|     def rehearse(self, examples, sgd=None, losses=None, **cfg): | ||||
|  | @ -344,7 +344,7 @@ cdef class Parser: | |||
|         set_dropout_rate(self._rehearsal_model, 0.0) | ||||
|         set_dropout_rate(self.model, 0.0) | ||||
|         tutor, _ = self._rehearsal_model.begin_update(docs) | ||||
|         model, finish_update = self.model.begin_update(docs) | ||||
|         model, backprop_tok2vec = self.model.begin_update(docs) | ||||
|         n_scores = 0. | ||||
|         loss = 0. | ||||
|         while states: | ||||
|  | @ -360,10 +360,16 @@ cdef class Parser: | |||
|             states = [state for state in states if not state.is_final()] | ||||
|             n_scores += d_scores.size | ||||
|         # Do the backprop | ||||
|         finish_update(docs) | ||||
|         backprop_tok2vec(docs) | ||||
|         if sgd is not None: | ||||
|             self.model.finish_update(sgd) | ||||
|         losses[self.name] += loss / n_scores | ||||
|         del backprop | ||||
|         del backprop_tok2vec | ||||
|         model.clear_memory() | ||||
|         tutor.clear_memory() | ||||
|         del model | ||||
|         del tutor | ||||
|         return losses | ||||
| 
 | ||||
|     def get_gradients(self): | ||||
|  | @ -407,6 +413,7 @@ cdef class Parser: | |||
|             cpu_log_loss(c_d_scores, | ||||
|                 costs, is_valid, &scores[i, 0], d_scores.shape[1]) | ||||
|             c_d_scores += d_scores.shape[1] | ||||
|         # Note that we don't normalize this. See comment in update() for why. | ||||
|         if losses is not None: | ||||
|             losses.setdefault(self.name, 0.) | ||||
|             losses[self.name] += (d_scores**2).sum() | ||||
|  | @ -525,21 +532,25 @@ cdef class Parser: | |||
|             StateClass state | ||||
|             Transition action | ||||
|         all_states = self.moves.init_batch([eg.predicted for eg in examples]) | ||||
|         states = [] | ||||
|         golds = [] | ||||
|         kept = [] | ||||
|         max_length_seen = 0 | ||||
|         for state, eg in zip(all_states, examples): | ||||
|             if self.moves.has_gold(eg) and not state.is_final(): | ||||
|                 gold = self.moves.init_gold(state, eg) | ||||
|                 oracle_actions = self.moves.get_oracle_sequence_from_state( | ||||
|                     state.copy(), gold) | ||||
|                 kept.append((eg, state, gold, oracle_actions)) | ||||
|                 min_length = min(min_length, len(oracle_actions)) | ||||
|                 max_length_seen = max(max_length, len(oracle_actions)) | ||||
|                 if len(eg.x) < max_length: | ||||
|                     states.append(state) | ||||
|                     golds.append(gold) | ||||
|                 else: | ||||
|                     oracle_actions = self.moves.get_oracle_sequence_from_state( | ||||
|                         state.copy(), gold) | ||||
|                     kept.append((eg, state, gold, oracle_actions)) | ||||
|                     min_length = min(min_length, len(oracle_actions)) | ||||
|                     max_length_seen = max(max_length, len(oracle_actions)) | ||||
|         if not kept: | ||||
|             return [], [], 0 | ||||
|             return states, golds, 0 | ||||
|         max_length = max(min_length, min(max_length, max_length_seen)) | ||||
|         states = [] | ||||
|         golds = [] | ||||
|         cdef int clas | ||||
|         max_moves = 0 | ||||
|         for eg, state, gold, oracle_actions in kept: | ||||
|  |  | |||
|  | @ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree): | |||
| 
 | ||||
| def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree): | ||||
|     assert contains_cycle(tree) is None | ||||
|     assert contains_cycle(cyclic_tree) == set([3, 4, 5]) | ||||
|     assert contains_cycle(cyclic_tree) == {3, 4, 5} | ||||
|     assert contains_cycle(partial_tree) is None | ||||
|     assert contains_cycle(multirooted_tree) is None | ||||
| 
 | ||||
|  |  | |||
|  | @ -198,10 +198,10 @@ def test_overfitting_IO(): | |||
|     nlp.add_pipe(parser) | ||||
|     optimizer = nlp.begin_training() | ||||
| 
 | ||||
|     for i in range(50): | ||||
|     for i in range(100): | ||||
|         losses = {} | ||||
|         nlp.update(train_examples, sgd=optimizer, losses=losses) | ||||
|     assert losses["parser"] < 0.00001 | ||||
|     assert losses["parser"] < 0.0001 | ||||
| 
 | ||||
|     # test the trained model | ||||
|     test_text = "I like securities." | ||||
|  |  | |||
|  | @ -38,6 +38,11 @@ def test_overfitting_IO(): | |||
|     train_examples = [] | ||||
|     for t in TRAIN_DATA: | ||||
|         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) | ||||
|     # add some cases where SENT_START == -1 | ||||
|     train_examples[0].reference[10].is_sent_start = False | ||||
|     train_examples[1].reference[1].is_sent_start = False | ||||
|     train_examples[1].reference[11].is_sent_start = False | ||||
| 
 | ||||
|     nlp.add_pipe(senter) | ||||
|     optimizer = nlp.begin_training() | ||||
| 
 | ||||
|  |  | |||
|  | @ -84,7 +84,7 @@ def test_overfitting_IO(): | |||
|     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly | ||||
|     fix_random_seed(0) | ||||
|     nlp = English() | ||||
|     textcat = nlp.create_pipe("textcat") | ||||
|     textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True}) | ||||
|     train_examples = [] | ||||
|     for text, annotations in TRAIN_DATA: | ||||
|         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) | ||||
|  |  | |||
|  | @ -23,6 +23,7 @@ def test_issue2070(): | |||
|     assert len(doc) == 11 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue2179(): | ||||
|     """Test that spurious 'extra_labels' aren't created when initializing NER.""" | ||||
|     nlp = Italian() | ||||
|  | @ -134,6 +135,7 @@ def test_issue2464(en_vocab): | |||
|     assert len(matches) == 3 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue2482(): | ||||
|     """Test we can serialize and deserialize a blank NER or parser model.""" | ||||
|     nlp = Italian() | ||||
|  |  | |||
|  | @ -138,13 +138,16 @@ def test_issue2782(text, lang_cls): | |||
|     assert doc[0].like_num | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue2800(): | ||||
|     """Test issue that arises when too many labels are added to NER model. | ||||
|     Used to cause segfault. | ||||
|     """ | ||||
|     nlp = English() | ||||
|     train_data = [] | ||||
|     train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]) | ||||
|     train_data.extend( | ||||
|         [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] | ||||
|     ) | ||||
|     entity_types = [str(i) for i in range(1000)] | ||||
|     ner = nlp.create_pipe("ner") | ||||
|     nlp.add_pipe(ner) | ||||
|  |  | |||
|  | @ -88,6 +88,7 @@ def test_issue3199(): | |||
|     assert list(doc[0:3].noun_chunks) == [] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue3209(): | ||||
|     """Test issue that occurred in spaCy nightly where NER labels were being | ||||
|     mapped to classes incorrectly after loading the model, when the labels | ||||
|  |  | |||
							
								
								
									
										472
									
								
								spacy/tests/regression/test_issue3501-4000.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										472
									
								
								spacy/tests/regression/test_issue3501-4000.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,472 @@ | |||
| import pytest | ||||
| from spacy.language import Language | ||||
| from spacy.vocab import Vocab | ||||
| from spacy.pipeline import EntityRuler, DependencyParser | ||||
| from spacy.pipeline.defaults import default_parser | ||||
| from spacy import displacy, load | ||||
| from spacy.displacy import parse_deps | ||||
| from spacy.tokens import Doc, Token | ||||
| from spacy.matcher import Matcher, PhraseMatcher | ||||
| from spacy.errors import MatchPatternError | ||||
| from spacy.util import minibatch | ||||
| from spacy.gold import Example | ||||
| from spacy.lang.hi import Hindi | ||||
| from spacy.lang.es import Spanish | ||||
| from spacy.lang.en import English | ||||
| from spacy.attrs import IS_ALPHA | ||||
| from thinc.api import compounding | ||||
| import spacy | ||||
| import srsly | ||||
| import numpy | ||||
| 
 | ||||
| from ..util import make_tempdir, get_doc | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) | ||||
| def test_issue3521(en_tokenizer, word): | ||||
|     tok = en_tokenizer(word)[1] | ||||
|     # 'not' and 'would' should be stopwords, also in their abbreviated forms | ||||
|     assert tok.is_stop | ||||
| 
 | ||||
| 
 | ||||
| def test_issue_3526_1(en_vocab): | ||||
|     patterns = [ | ||||
|         {"label": "HELLO", "pattern": "hello world"}, | ||||
|         {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, | ||||
|         {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, | ||||
|         {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, | ||||
|         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, | ||||
|     ] | ||||
|     nlp = Language(vocab=en_vocab) | ||||
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) | ||||
|     ruler_bytes = ruler.to_bytes() | ||||
|     assert len(ruler) == len(patterns) | ||||
|     assert len(ruler.labels) == 4 | ||||
|     assert ruler.overwrite | ||||
|     new_ruler = EntityRuler(nlp) | ||||
|     new_ruler = new_ruler.from_bytes(ruler_bytes) | ||||
|     assert len(new_ruler) == len(ruler) | ||||
|     assert len(new_ruler.labels) == 4 | ||||
|     assert new_ruler.overwrite == ruler.overwrite | ||||
|     assert new_ruler.ent_id_sep == ruler.ent_id_sep | ||||
| 
 | ||||
| 
 | ||||
| def test_issue_3526_2(en_vocab): | ||||
|     patterns = [ | ||||
|         {"label": "HELLO", "pattern": "hello world"}, | ||||
|         {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, | ||||
|         {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, | ||||
|         {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, | ||||
|         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, | ||||
|     ] | ||||
|     nlp = Language(vocab=en_vocab) | ||||
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) | ||||
|     bytes_old_style = srsly.msgpack_dumps(ruler.patterns) | ||||
|     new_ruler = EntityRuler(nlp) | ||||
|     new_ruler = new_ruler.from_bytes(bytes_old_style) | ||||
|     assert len(new_ruler) == len(ruler) | ||||
|     for pattern in ruler.patterns: | ||||
|         assert pattern in new_ruler.patterns | ||||
|     assert new_ruler.overwrite is not ruler.overwrite | ||||
| 
 | ||||
| 
 | ||||
| def test_issue_3526_3(en_vocab): | ||||
|     patterns = [ | ||||
|         {"label": "HELLO", "pattern": "hello world"}, | ||||
|         {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, | ||||
|         {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, | ||||
|         {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, | ||||
|         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, | ||||
|     ] | ||||
|     nlp = Language(vocab=en_vocab) | ||||
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) | ||||
|     with make_tempdir() as tmpdir: | ||||
|         out_file = tmpdir / "entity_ruler" | ||||
|         srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) | ||||
|         new_ruler = EntityRuler(nlp).from_disk(out_file) | ||||
|         for pattern in ruler.patterns: | ||||
|             assert pattern in new_ruler.patterns | ||||
|         assert len(new_ruler) == len(ruler) | ||||
|         assert new_ruler.overwrite is not ruler.overwrite | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue_3526_4(en_vocab): | ||||
|     nlp = Language(vocab=en_vocab) | ||||
|     ruler = EntityRuler(nlp, overwrite_ents=True) | ||||
|     ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) | ||||
|     nlp.add_pipe(ruler) | ||||
|     with make_tempdir() as tmpdir: | ||||
|         nlp.to_disk(tmpdir) | ||||
|         ruler = nlp.get_pipe("entity_ruler") | ||||
|         assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] | ||||
|         assert ruler.overwrite is True | ||||
|         nlp2 = load(tmpdir) | ||||
|         new_ruler = nlp2.get_pipe("entity_ruler") | ||||
|         assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] | ||||
|         assert new_ruler.overwrite is True | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3531(): | ||||
|     """Test that displaCy renderer doesn't require "settings" key.""" | ||||
|     example_dep = { | ||||
|         "words": [ | ||||
|             {"text": "But", "tag": "CCONJ"}, | ||||
|             {"text": "Google", "tag": "PROPN"}, | ||||
|             {"text": "is", "tag": "VERB"}, | ||||
|             {"text": "starting", "tag": "VERB"}, | ||||
|             {"text": "from", "tag": "ADP"}, | ||||
|             {"text": "behind.", "tag": "ADV"}, | ||||
|         ], | ||||
|         "arcs": [ | ||||
|             {"start": 0, "end": 3, "label": "cc", "dir": "left"}, | ||||
|             {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, | ||||
|             {"start": 2, "end": 3, "label": "aux", "dir": "left"}, | ||||
|             {"start": 3, "end": 4, "label": "prep", "dir": "right"}, | ||||
|             {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, | ||||
|         ], | ||||
|     } | ||||
|     example_ent = { | ||||
|         "text": "But Google is starting from behind.", | ||||
|         "ents": [{"start": 4, "end": 10, "label": "ORG"}], | ||||
|     } | ||||
|     dep_html = displacy.render(example_dep, style="dep", manual=True) | ||||
|     assert dep_html | ||||
|     ent_html = displacy.render(example_ent, style="ent", manual=True) | ||||
|     assert ent_html | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3540(en_vocab): | ||||
|     words = ["I", "live", "in", "NewYork", "right", "now"] | ||||
|     tensor = numpy.asarray( | ||||
|         [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], | ||||
|         dtype="f", | ||||
|     ) | ||||
|     doc = Doc(en_vocab, words=words) | ||||
|     doc.tensor = tensor | ||||
|     gold_text = ["I", "live", "in", "NewYork", "right", "now"] | ||||
|     assert [token.text for token in doc] == gold_text | ||||
|     gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] | ||||
|     assert [token.lemma_ for token in doc] == gold_lemma | ||||
|     vectors_1 = [token.vector for token in doc] | ||||
|     assert len(vectors_1) == len(doc) | ||||
| 
 | ||||
|     with doc.retokenize() as retokenizer: | ||||
|         heads = [(doc[3], 1), doc[2]] | ||||
|         attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} | ||||
|         retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) | ||||
| 
 | ||||
|     gold_text = ["I", "live", "in", "New", "York", "right", "now"] | ||||
|     assert [token.text for token in doc] == gold_text | ||||
|     gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] | ||||
|     assert [token.lemma_ for token in doc] == gold_lemma | ||||
|     vectors_2 = [token.vector for token in doc] | ||||
|     assert len(vectors_2) == len(doc) | ||||
|     assert vectors_1[0].tolist() == vectors_2[0].tolist() | ||||
|     assert vectors_1[1].tolist() == vectors_2[1].tolist() | ||||
|     assert vectors_1[2].tolist() == vectors_2[2].tolist() | ||||
|     assert vectors_1[4].tolist() == vectors_2[5].tolist() | ||||
|     assert vectors_1[5].tolist() == vectors_2[6].tolist() | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3549(en_vocab): | ||||
|     """Test that match pattern validation doesn't raise on empty errors.""" | ||||
|     matcher = Matcher(en_vocab, validate=True) | ||||
|     pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] | ||||
|     matcher.add("GOOD", [pattern]) | ||||
|     with pytest.raises(MatchPatternError): | ||||
|         matcher.add("BAD", [[{"X": "Y"}]]) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_issue3555(en_vocab): | ||||
|     """Test that custom extensions with default None don't break matcher.""" | ||||
|     Token.set_extension("issue3555", default=None) | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] | ||||
|     matcher.add("TEST", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["have", "apple"]) | ||||
|     matcher(doc) | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3611(): | ||||
|     """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ | ||||
|     unique_classes = ["offensive", "inoffensive"] | ||||
|     x_train = [ | ||||
|         "This is an offensive text", | ||||
|         "This is the second offensive text", | ||||
|         "inoff", | ||||
|     ] | ||||
|     y_train = ["offensive", "offensive", "inoffensive"] | ||||
|     nlp = spacy.blank("en") | ||||
|     # preparing the data | ||||
|     train_data = [] | ||||
|     for text, train_instance in zip(x_train, y_train): | ||||
|         cat_dict = {label: label == train_instance for label in unique_classes} | ||||
|         train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) | ||||
|     # add a text categorizer component | ||||
|     textcat = nlp.create_pipe( | ||||
|         "textcat", | ||||
|         config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, | ||||
|     ) | ||||
|     for label in unique_classes: | ||||
|         textcat.add_label(label) | ||||
|     nlp.add_pipe(textcat, last=True) | ||||
|     # training the network | ||||
|     with nlp.select_pipes(enable="textcat"): | ||||
|         optimizer = nlp.begin_training(X=x_train, Y=y_train) | ||||
|         for i in range(3): | ||||
|             losses = {} | ||||
|             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) | ||||
| 
 | ||||
|             for batch in batches: | ||||
|                 nlp.update( | ||||
|                     examples=batch, sgd=optimizer, drop=0.1, losses=losses, | ||||
|                 ) | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3625(): | ||||
|     """Test that default punctuation rules apply to Hindi Unicode characters.""" | ||||
|     nlp = Hindi() | ||||
|     doc = nlp("hi. how हुए. होटल, होटल") | ||||
|     expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] | ||||
|     assert [token.text for token in doc] == expected | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3803(): | ||||
|     """Test that spanish num-like tokens have True for like_num attribute.""" | ||||
|     nlp = Spanish() | ||||
|     text = "2 dos 1000 mil 12 doce" | ||||
|     doc = nlp(text) | ||||
| 
 | ||||
|     assert [t.like_num for t in doc] == [True, True, True, True, True, True] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue3830_no_subtok(): | ||||
|     """Test that the parser doesn't have subtok label if not learn_tokens""" | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(Vocab(), default_parser(), **config) | ||||
|     parser.add_label("nsubj") | ||||
|     assert "subtok" not in parser.labels | ||||
|     parser.begin_training(lambda: []) | ||||
|     assert "subtok" not in parser.labels | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue3830_with_subtok(): | ||||
|     """Test that the parser does have subtok label if learn_tokens=True.""" | ||||
|     config = { | ||||
|         "learn_tokens": True, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(Vocab(), default_parser(), **config) | ||||
|     parser.add_label("nsubj") | ||||
|     assert "subtok" not in parser.labels | ||||
|     parser.begin_training(lambda: []) | ||||
|     assert "subtok" in parser.labels | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3839(en_vocab): | ||||
|     """Test that match IDs returned by the matcher are correct, i.e. equal to the key's ID in vocab.strings.""" | ||||
|     doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) | ||||
|     matcher = Matcher(en_vocab) | ||||
|     match_id = "PATTERN" | ||||
|     pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] | ||||
|     pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] | ||||
|     matcher.add(match_id, [pattern1]) | ||||
|     matches = matcher(doc) | ||||
|     assert matches[0][0] == en_vocab.strings[match_id] | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add(match_id, [pattern2]) | ||||
|     matches = matcher(doc) | ||||
|     assert matches[0][0] == en_vocab.strings[match_id] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "sentence", | ||||
|     [ | ||||
|         "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", | ||||
|         "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", | ||||
|         "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", | ||||
|         "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", | ||||
|         "It was a missed assignment, but it shouldn't have resulted in a turnover ...", | ||||
|     ], | ||||
| ) | ||||
| def test_issue3869(sentence): | ||||
|     """Test that the Doc's count_by function works consistently""" | ||||
|     nlp = English() | ||||
|     doc = nlp(sentence) | ||||
|     count = 0 | ||||
|     for token in doc: | ||||
|         count += token.is_alpha | ||||
|     assert count == doc.count_by(IS_ALPHA).get(1, 0) | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3879(en_vocab): | ||||
|     doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) | ||||
|     assert len(doc) == 5 | ||||
|     pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add("TEST", [pattern]) | ||||
|     assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test' | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue3880(): | ||||
|     """Test that `nlp.pipe()` works when an empty string ends the batch. | ||||
| 
 | ||||
|     Fixed in v7.0.5 of Thinc. | ||||
|     """ | ||||
|     texts = ["hello", "world", "", ""] | ||||
|     nlp = English() | ||||
|     nlp.add_pipe(nlp.create_pipe("parser")) | ||||
|     nlp.add_pipe(nlp.create_pipe("ner")) | ||||
|     nlp.add_pipe(nlp.create_pipe("tagger")) | ||||
|     nlp.get_pipe("parser").add_label("dep") | ||||
|     nlp.get_pipe("ner").add_label("PERSON") | ||||
|     nlp.get_pipe("tagger").add_label("NN") | ||||
|     nlp.begin_training() | ||||
|     for doc in nlp.pipe(texts): | ||||
|         pass | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3882(en_vocab): | ||||
|     """Test that displaCy doesn't serialize the doc.user_data when making a | ||||
|     copy of the Doc. | ||||
|     """ | ||||
|     doc = Doc(en_vocab, words=["Hello", "world"]) | ||||
|     doc.is_parsed = True | ||||
|     doc.user_data["test"] = set() | ||||
|     parse_deps(doc) | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3951(en_vocab): | ||||
|     """Test that combinations of optional rules are matched correctly.""" | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [ | ||||
|         {"LOWER": "hello"}, | ||||
|         {"LOWER": "this", "OP": "?"}, | ||||
|         {"OP": "?"}, | ||||
|         {"LOWER": "world"}, | ||||
|     ] | ||||
|     matcher.add("TEST", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) | ||||
|     matches = matcher(doc) | ||||
|     assert len(matches) == 0 | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3959(): | ||||
|     """ Ensure that a modified pos attribute is serialized correctly.""" | ||||
|     nlp = English() | ||||
|     doc = nlp( | ||||
|         "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" | ||||
|     ) | ||||
|     assert doc[0].pos_ == "" | ||||
|     doc[0].pos_ = "NOUN" | ||||
|     assert doc[0].pos_ == "NOUN" | ||||
|     # usually this is already True when starting from proper models instead of blank English | ||||
|     doc.is_tagged = True | ||||
|     with make_tempdir() as tmp_dir: | ||||
|         file_path = tmp_dir / "my_doc" | ||||
|         doc.to_disk(file_path) | ||||
|         doc2 = nlp("") | ||||
|         doc2.from_disk(file_path) | ||||
|         assert doc2[0].pos_ == "NOUN" | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3962(en_vocab): | ||||
|     """ Ensure that as_doc does not result in out-of-bound access of tokens. | ||||
|     This is achieved by setting the head to itself if it would lie out of the span otherwise.""" | ||||
|     # fmt: off | ||||
|     words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] | ||||
|     heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] | ||||
|     deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] | ||||
|     # fmt: on | ||||
|     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) | ||||
|     span2 = doc[1:5]  # "jests at scars ," | ||||
|     doc2 = span2.as_doc() | ||||
|     doc2_json = doc2.to_json() | ||||
|     assert doc2_json | ||||
|     # head set to itself, being the new artificial root | ||||
|     assert doc2[0].head.text == "jests" | ||||
|     assert doc2[0].dep_ == "dep" | ||||
|     assert doc2[1].head.text == "jests" | ||||
|     assert doc2[1].dep_ == "prep" | ||||
|     assert doc2[2].head.text == "at" | ||||
|     assert doc2[2].dep_ == "pobj" | ||||
|     assert doc2[3].head.text == "jests"  # head set to the new artificial root | ||||
|     assert doc2[3].dep_ == "dep" | ||||
|     # We should still have 1 sentence | ||||
|     assert len(list(doc2.sents)) == 1 | ||||
|     span3 = doc[6:9]  # "never felt a" | ||||
|     doc3 = span3.as_doc() | ||||
|     doc3_json = doc3.to_json() | ||||
|     assert doc3_json | ||||
|     assert doc3[0].head.text == "felt" | ||||
|     assert doc3[0].dep_ == "neg" | ||||
|     assert doc3[1].head.text == "felt" | ||||
|     assert doc3[1].dep_ == "ROOT" | ||||
|     assert doc3[2].head.text == "felt"  # head set to ancestor | ||||
|     assert doc3[2].dep_ == "dep" | ||||
|     # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" | ||||
|     assert len(list(doc3.sents)) == 1 | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3962_long(en_vocab): | ||||
|     """ Ensure that as_doc does not result in out-of-bound access of tokens. | ||||
|     This is achieved by setting the head to itself if it would lie out of the span otherwise.""" | ||||
|     # fmt: off | ||||
|     words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] | ||||
|     heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] | ||||
|     deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] | ||||
|     # fmt: on | ||||
|     two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) | ||||
|     span2 = two_sent_doc[1:7]  # "jests at scars. They never" | ||||
|     doc2 = span2.as_doc() | ||||
|     doc2_json = doc2.to_json() | ||||
|     assert doc2_json | ||||
|     # head set to itself, being the new artificial root (in sentence 1) | ||||
|     assert doc2[0].head.text == "jests" | ||||
|     assert doc2[0].dep_ == "ROOT" | ||||
|     assert doc2[1].head.text == "jests" | ||||
|     assert doc2[1].dep_ == "prep" | ||||
|     assert doc2[2].head.text == "at" | ||||
|     assert doc2[2].dep_ == "pobj" | ||||
|     assert doc2[3].head.text == "jests" | ||||
|     assert doc2[3].dep_ == "punct" | ||||
|     # head set to itself, being the new artificial root (in sentence 2) | ||||
|     assert doc2[4].head.text == "They" | ||||
|     assert doc2[4].dep_ == "dep" | ||||
|     # head set to the new artificial head (in sentence 2) | ||||
|     assert doc2[4].head.text == "They" | ||||
|     assert doc2[4].dep_ == "dep" | ||||
|     # We should still have 2 sentences | ||||
|     sents = list(doc2.sents) | ||||
|     assert len(sents) == 2 | ||||
|     assert sents[0].text == "jests at scars ." | ||||
|     assert sents[1].text == "They never" | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3972(en_vocab): | ||||
|     """Test that the PhraseMatcher returns duplicates for duplicate match IDs. | ||||
|     """ | ||||
|     matcher = PhraseMatcher(en_vocab) | ||||
|     matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) | ||||
|     matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) | ||||
|     doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) | ||||
|     matches = matcher(doc) | ||||
| 
 | ||||
|     assert len(matches) == 2 | ||||
| 
 | ||||
|     # We should have a match for each of the two rules | ||||
|     found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] | ||||
|     assert "A" in found_ids | ||||
|     assert "B" in found_ids | ||||
|  | @ -1,8 +0,0 @@ | |||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) | ||||
| def test_issue3521(en_tokenizer, word): | ||||
|     tok = en_tokenizer(word)[1] | ||||
|     # 'not' and 'would' should be stopwords, also in their abbreviated forms | ||||
|     assert tok.is_stop | ||||
|  | @ -1,85 +0,0 @@ | |||
| import pytest | ||||
| from spacy.tokens import Span | ||||
| from spacy.language import Language | ||||
| from spacy.pipeline import EntityRuler | ||||
| from spacy import load | ||||
| import srsly | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def patterns(): | ||||
|     return [ | ||||
|         {"label": "HELLO", "pattern": "hello world"}, | ||||
|         {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, | ||||
|         {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, | ||||
|         {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, | ||||
|         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, | ||||
|     ] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def add_ent(): | ||||
|     def add_ent_component(doc): | ||||
|         doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])] | ||||
|         return doc | ||||
| 
 | ||||
|     return add_ent_component | ||||
| 
 | ||||
| 
 | ||||
| def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab): | ||||
|     nlp = Language(vocab=en_vocab) | ||||
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) | ||||
|     ruler_bytes = ruler.to_bytes() | ||||
|     assert len(ruler) == len(patterns) | ||||
|     assert len(ruler.labels) == 4 | ||||
|     assert ruler.overwrite | ||||
|     new_ruler = EntityRuler(nlp) | ||||
|     new_ruler = new_ruler.from_bytes(ruler_bytes) | ||||
|     assert len(new_ruler) == len(ruler) | ||||
|     assert len(new_ruler.labels) == 4 | ||||
|     assert new_ruler.overwrite == ruler.overwrite | ||||
|     assert new_ruler.ent_id_sep == ruler.ent_id_sep | ||||
| 
 | ||||
| 
 | ||||
| def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab): | ||||
|     nlp = Language(vocab=en_vocab) | ||||
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) | ||||
|     bytes_old_style = srsly.msgpack_dumps(ruler.patterns) | ||||
|     new_ruler = EntityRuler(nlp) | ||||
|     new_ruler = new_ruler.from_bytes(bytes_old_style) | ||||
|     assert len(new_ruler) == len(ruler) | ||||
|     for pattern in ruler.patterns: | ||||
|         assert pattern in new_ruler.patterns | ||||
|     assert new_ruler.overwrite is not ruler.overwrite | ||||
| 
 | ||||
| 
 | ||||
| def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab): | ||||
|     nlp = Language(vocab=en_vocab) | ||||
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) | ||||
|     with make_tempdir() as tmpdir: | ||||
|         out_file = tmpdir / "entity_ruler" | ||||
|         srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) | ||||
|         new_ruler = EntityRuler(nlp).from_disk(out_file) | ||||
|         for pattern in ruler.patterns: | ||||
|             assert pattern in new_ruler.patterns | ||||
|         assert len(new_ruler) == len(ruler) | ||||
|         assert new_ruler.overwrite is not ruler.overwrite | ||||
| 
 | ||||
| 
 | ||||
| def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab): | ||||
|     nlp = Language(vocab=en_vocab) | ||||
|     ruler = EntityRuler(nlp, overwrite_ents=True) | ||||
| 
 | ||||
|     ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) | ||||
|     nlp.add_pipe(ruler) | ||||
|     with make_tempdir() as tmpdir: | ||||
|         nlp.to_disk(tmpdir) | ||||
|         ruler = nlp.get_pipe("entity_ruler") | ||||
|         assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] | ||||
|         assert ruler.overwrite is True | ||||
|         nlp2 = load(tmpdir) | ||||
|         new_ruler = nlp2.get_pipe("entity_ruler") | ||||
|         assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] | ||||
|         assert new_ruler.overwrite is True | ||||
|  | @ -1,30 +0,0 @@ | |||
| from spacy import displacy | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3531(): | ||||
|     """Test that displaCy renderer doesn't require "settings" key.""" | ||||
|     example_dep = { | ||||
|         "words": [ | ||||
|             {"text": "But", "tag": "CCONJ"}, | ||||
|             {"text": "Google", "tag": "PROPN"}, | ||||
|             {"text": "is", "tag": "VERB"}, | ||||
|             {"text": "starting", "tag": "VERB"}, | ||||
|             {"text": "from", "tag": "ADP"}, | ||||
|             {"text": "behind.", "tag": "ADV"}, | ||||
|         ], | ||||
|         "arcs": [ | ||||
|             {"start": 0, "end": 3, "label": "cc", "dir": "left"}, | ||||
|             {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, | ||||
|             {"start": 2, "end": 3, "label": "aux", "dir": "left"}, | ||||
|             {"start": 3, "end": 4, "label": "prep", "dir": "right"}, | ||||
|             {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, | ||||
|         ], | ||||
|     } | ||||
|     example_ent = { | ||||
|         "text": "But Google is starting from behind.", | ||||
|         "ents": [{"start": 4, "end": 10, "label": "ORG"}], | ||||
|     } | ||||
|     dep_html = displacy.render(example_dep, style="dep", manual=True) | ||||
|     assert dep_html | ||||
|     ent_html = displacy.render(example_ent, style="ent", manual=True) | ||||
|     assert ent_html | ||||
|  | @ -1,44 +0,0 @@ | |||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| import numpy as np | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3540(en_vocab): | ||||
| 
 | ||||
|     words = ["I", "live", "in", "NewYork", "right", "now"] | ||||
|     tensor = np.asarray( | ||||
|         [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], | ||||
|         dtype="f", | ||||
|     ) | ||||
|     doc = Doc(en_vocab, words=words) | ||||
|     doc.tensor = tensor | ||||
| 
 | ||||
|     gold_text = ["I", "live", "in", "NewYork", "right", "now"] | ||||
|     assert [token.text for token in doc] == gold_text | ||||
| 
 | ||||
|     gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] | ||||
|     assert [token.lemma_ for token in doc] == gold_lemma | ||||
| 
 | ||||
|     vectors_1 = [token.vector for token in doc] | ||||
|     assert len(vectors_1) == len(doc) | ||||
| 
 | ||||
|     with doc.retokenize() as retokenizer: | ||||
|         heads = [(doc[3], 1), doc[2]] | ||||
|         attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} | ||||
|         retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) | ||||
| 
 | ||||
|     gold_text = ["I", "live", "in", "New", "York", "right", "now"] | ||||
|     assert [token.text for token in doc] == gold_text | ||||
| 
 | ||||
|     gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] | ||||
|     assert [token.lemma_ for token in doc] == gold_lemma | ||||
| 
 | ||||
|     vectors_2 = [token.vector for token in doc] | ||||
|     assert len(vectors_2) == len(doc) | ||||
| 
 | ||||
|     assert vectors_1[0].tolist() == vectors_2[0].tolist() | ||||
|     assert vectors_1[1].tolist() == vectors_2[1].tolist() | ||||
|     assert vectors_1[2].tolist() == vectors_2[2].tolist() | ||||
| 
 | ||||
|     assert vectors_1[4].tolist() == vectors_2[5].tolist() | ||||
|     assert vectors_1[5].tolist() == vectors_2[6].tolist() | ||||
|  | @ -1,12 +0,0 @@ | |||
| import pytest | ||||
| from spacy.matcher import Matcher | ||||
| from spacy.errors import MatchPatternError | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3549(en_vocab): | ||||
|     """Test that match pattern validation doesn't raise on empty errors.""" | ||||
|     matcher = Matcher(en_vocab, validate=True) | ||||
|     pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] | ||||
|     matcher.add("GOOD", [pattern]) | ||||
|     with pytest.raises(MatchPatternError): | ||||
|         matcher.add("BAD", [[{"X": "Y"}]]) | ||||
|  | @ -1,14 +0,0 @@ | |||
| import pytest | ||||
| from spacy.tokens import Doc, Token | ||||
| from spacy.matcher import Matcher | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_issue3555(en_vocab): | ||||
|     """Test that custom extensions with default None don't break matcher.""" | ||||
|     Token.set_extension("issue3555", default=None) | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] | ||||
|     matcher.add("TEST", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["have", "apple"]) | ||||
|     matcher(doc) | ||||
|  | @ -1,45 +0,0 @@ | |||
| import spacy | ||||
| from spacy.util import minibatch | ||||
| from thinc.api import compounding | ||||
| from spacy.gold import Example | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3611(): | ||||
|     """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ | ||||
|     unique_classes = ["offensive", "inoffensive"] | ||||
|     x_train = [ | ||||
|         "This is an offensive text", | ||||
|         "This is the second offensive text", | ||||
|         "inoff", | ||||
|     ] | ||||
|     y_train = ["offensive", "offensive", "inoffensive"] | ||||
| 
 | ||||
|     nlp = spacy.blank("en") | ||||
| 
 | ||||
|     # preparing the data | ||||
|     train_data = [] | ||||
|     for text, train_instance in zip(x_train, y_train): | ||||
|         cat_dict = {label: label == train_instance for label in unique_classes} | ||||
|         train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) | ||||
| 
 | ||||
|     # add a text categorizer component | ||||
|     textcat = nlp.create_pipe( | ||||
|         "textcat", | ||||
|         config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, | ||||
|     ) | ||||
| 
 | ||||
|     for label in unique_classes: | ||||
|         textcat.add_label(label) | ||||
|     nlp.add_pipe(textcat, last=True) | ||||
| 
 | ||||
|     # training the network | ||||
|     with nlp.select_pipes(enable="textcat"): | ||||
|         optimizer = nlp.begin_training(X=x_train, Y=y_train) | ||||
|         for i in range(3): | ||||
|             losses = {} | ||||
|             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) | ||||
| 
 | ||||
|             for batch in batches: | ||||
|                 nlp.update( | ||||
|                     examples=batch, sgd=optimizer, drop=0.1, losses=losses, | ||||
|                 ) | ||||
|  | @ -1,9 +0,0 @@ | |||
| from spacy.lang.hi import Hindi | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3625(): | ||||
|     """Test that default punctuation rules applies to hindi unicode characters""" | ||||
|     nlp = Hindi() | ||||
|     doc = nlp("hi. how हुए. होटल, होटल") | ||||
|     expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] | ||||
|     assert [token.text for token in doc] == expected | ||||
|  | @ -1,10 +0,0 @@ | |||
| from spacy.lang.es import Spanish | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3803(): | ||||
|     """Test that spanish num-like tokens have True for like_num attribute.""" | ||||
|     nlp = Spanish() | ||||
|     text = "2 dos 1000 mil 12 doce" | ||||
|     doc = nlp(text) | ||||
| 
 | ||||
|     assert [t.like_num for t in doc] == [True, True, True, True, True, True] | ||||
|  | @ -1,34 +0,0 @@ | |||
| from spacy.pipeline.pipes import DependencyParser | ||||
| from spacy.vocab import Vocab | ||||
| 
 | ||||
| from spacy.pipeline.defaults import default_parser | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3830_no_subtok(): | ||||
|     """Test that the parser doesn't have subtok label if not learn_tokens""" | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(Vocab(), default_parser(), **config) | ||||
|     parser.add_label("nsubj") | ||||
|     assert "subtok" not in parser.labels | ||||
|     parser.begin_training(lambda: []) | ||||
|     assert "subtok" not in parser.labels | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3830_with_subtok(): | ||||
|     """Test that the parser does have subtok label if learn_tokens=True.""" | ||||
|     config = { | ||||
|         "learn_tokens": True, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(Vocab(), default_parser(), **config) | ||||
|     parser.add_label("nsubj") | ||||
|     assert "subtok" not in parser.labels | ||||
|     parser.begin_training(lambda: []) | ||||
|     assert "subtok" in parser.labels | ||||
|  | @ -1,18 +0,0 @@ | |||
| from spacy.matcher import Matcher | ||||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3839(en_vocab): | ||||
|     """Test that match IDs returned by the matcher are correct, are in the string """ | ||||
|     doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) | ||||
|     matcher = Matcher(en_vocab) | ||||
|     match_id = "PATTERN" | ||||
|     pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] | ||||
|     pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] | ||||
|     matcher.add(match_id, [pattern1]) | ||||
|     matches = matcher(doc) | ||||
|     assert matches[0][0] == en_vocab.strings[match_id] | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add(match_id, [pattern2]) | ||||
|     matches = matcher(doc) | ||||
|     assert matches[0][0] == en_vocab.strings[match_id] | ||||
|  | @ -1,25 +0,0 @@ | |||
| import pytest | ||||
| from spacy.attrs import IS_ALPHA | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "sentence", | ||||
|     [ | ||||
|         "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", | ||||
|         "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", | ||||
|         "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", | ||||
|         "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", | ||||
|         "It was a missed assignment, but it shouldn't have resulted in a turnover ...", | ||||
|     ], | ||||
| ) | ||||
| def test_issue3869(sentence): | ||||
|     """Test that the Doc's count_by function works consistently""" | ||||
|     nlp = English() | ||||
|     doc = nlp(sentence) | ||||
| 
 | ||||
|     count = 0 | ||||
|     for token in doc: | ||||
|         count += token.is_alpha | ||||
| 
 | ||||
|     assert count == doc.count_by(IS_ALPHA).get(1, 0) | ||||
|  | @ -1,11 +0,0 @@ | |||
| from spacy.matcher import Matcher | ||||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3879(en_vocab): | ||||
|     doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) | ||||
|     assert len(doc) == 5 | ||||
|     pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add("TEST", [pattern]) | ||||
|     assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test' | ||||
|  | @ -1,21 +0,0 @@ | |||
| from spacy.lang.en import English | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue3880(): | ||||
|     """Test that `nlp.pipe()` works when an empty string ends the batch. | ||||
| 
 | ||||
|     Fixed in v7.0.5 of Thinc. | ||||
|     """ | ||||
|     texts = ["hello", "world", "", ""] | ||||
|     nlp = English() | ||||
|     nlp.add_pipe(nlp.create_pipe("parser")) | ||||
|     nlp.add_pipe(nlp.create_pipe("ner")) | ||||
|     nlp.add_pipe(nlp.create_pipe("tagger")) | ||||
|     nlp.get_pipe("parser").add_label("dep") | ||||
|     nlp.get_pipe("ner").add_label("PERSON") | ||||
|     nlp.get_pipe("tagger").add_label("NN") | ||||
|     nlp.begin_training() | ||||
|     for doc in nlp.pipe(texts): | ||||
|         pass | ||||
|  | @ -1,12 +0,0 @@ | |||
| from spacy.displacy import parse_deps | ||||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3882(en_vocab): | ||||
|     """Test that displaCy doesn't serialize the doc.user_data when making a | ||||
|     copy of the Doc. | ||||
|     """ | ||||
|     doc = Doc(en_vocab, words=["Hello", "world"]) | ||||
|     doc.is_parsed = True | ||||
|     doc.user_data["test"] = set() | ||||
|     parse_deps(doc) | ||||
|  | @ -1,17 +0,0 @@ | |||
| from spacy.matcher import Matcher | ||||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3951(en_vocab): | ||||
|     """Test that combinations of optional rules are matched correctly.""" | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [ | ||||
|         {"LOWER": "hello"}, | ||||
|         {"LOWER": "this", "OP": "?"}, | ||||
|         {"OP": "?"}, | ||||
|         {"LOWER": "world"}, | ||||
|     ] | ||||
|     matcher.add("TEST", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) | ||||
|     matches = matcher(doc) | ||||
|     assert len(matches) == 0 | ||||
|  | @ -1,26 +0,0 @@ | |||
| from spacy.lang.en import English | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3959(): | ||||
|     """ Ensure that a modified pos attribute is serialized correctly.""" | ||||
|     nlp = English() | ||||
|     doc = nlp( | ||||
|         "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" | ||||
|     ) | ||||
|     assert doc[0].pos_ == "" | ||||
| 
 | ||||
|     doc[0].pos_ = "NOUN" | ||||
|     assert doc[0].pos_ == "NOUN" | ||||
| 
 | ||||
|     # usually this is already True when starting from proper models instead of blank English | ||||
|     doc.is_tagged = True | ||||
| 
 | ||||
|     with make_tempdir() as tmp_dir: | ||||
|         file_path = tmp_dir / "my_doc" | ||||
|         doc.to_disk(file_path) | ||||
| 
 | ||||
|         doc2 = nlp("") | ||||
|         doc2.from_disk(file_path) | ||||
| 
 | ||||
|         assert doc2[0].pos_ == "NOUN" | ||||
|  | @ -1,117 +0,0 @@ | |||
| import pytest | ||||
| 
 | ||||
| from ..util import get_doc | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def doc(en_tokenizer): | ||||
|     text = "He jests at scars, that never felt a wound." | ||||
|     heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] | ||||
|     deps = [ | ||||
|         "nsubj", | ||||
|         "ccomp", | ||||
|         "prep", | ||||
|         "pobj", | ||||
|         "punct", | ||||
|         "nsubj", | ||||
|         "neg", | ||||
|         "ROOT", | ||||
|         "det", | ||||
|         "dobj", | ||||
|         "punct", | ||||
|     ] | ||||
|     tokens = en_tokenizer(text) | ||||
|     return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3962(doc): | ||||
|     """ Ensure that as_doc does not result in out-of-bound access of tokens. | ||||
|     This is achieved by setting the head to itself if it would lie out of the span otherwise.""" | ||||
|     span2 = doc[1:5]  # "jests at scars ," | ||||
|     doc2 = span2.as_doc() | ||||
|     doc2_json = doc2.to_json() | ||||
|     assert doc2_json | ||||
| 
 | ||||
|     assert ( | ||||
|         doc2[0].head.text == "jests" | ||||
|     )  # head set to itself, being the new artificial root | ||||
|     assert doc2[0].dep_ == "dep" | ||||
|     assert doc2[1].head.text == "jests" | ||||
|     assert doc2[1].dep_ == "prep" | ||||
|     assert doc2[2].head.text == "at" | ||||
|     assert doc2[2].dep_ == "pobj" | ||||
|     assert doc2[3].head.text == "jests"  # head set to the new artificial root | ||||
|     assert doc2[3].dep_ == "dep" | ||||
| 
 | ||||
|     # We should still have 1 sentence | ||||
|     assert len(list(doc2.sents)) == 1 | ||||
| 
 | ||||
|     span3 = doc[6:9]  # "never felt a" | ||||
|     doc3 = span3.as_doc() | ||||
|     doc3_json = doc3.to_json() | ||||
|     assert doc3_json | ||||
| 
 | ||||
|     assert doc3[0].head.text == "felt" | ||||
|     assert doc3[0].dep_ == "neg" | ||||
|     assert doc3[1].head.text == "felt" | ||||
|     assert doc3[1].dep_ == "ROOT" | ||||
|     assert doc3[2].head.text == "felt"  # head set to ancestor | ||||
|     assert doc3[2].dep_ == "dep" | ||||
| 
 | ||||
|     # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" | ||||
|     assert len(list(doc3.sents)) == 1 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def two_sent_doc(en_tokenizer): | ||||
|     text = "He jests at scars. They never felt a wound." | ||||
|     heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] | ||||
|     deps = [ | ||||
|         "nsubj", | ||||
|         "ROOT", | ||||
|         "prep", | ||||
|         "pobj", | ||||
|         "punct", | ||||
|         "nsubj", | ||||
|         "neg", | ||||
|         "ROOT", | ||||
|         "det", | ||||
|         "dobj", | ||||
|         "punct", | ||||
|     ] | ||||
|     tokens = en_tokenizer(text) | ||||
|     return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3962_long(two_sent_doc): | ||||
|     """ Ensure that as_doc does not result in out-of-bound access of tokens. | ||||
|     This is achieved by setting the head to itself if it would lie out of the span otherwise.""" | ||||
|     span2 = two_sent_doc[1:7]  # "jests at scars. They never" | ||||
|     doc2 = span2.as_doc() | ||||
|     doc2_json = doc2.to_json() | ||||
|     assert doc2_json | ||||
| 
 | ||||
|     assert ( | ||||
|         doc2[0].head.text == "jests" | ||||
|     )  # head set to itself, being the new artificial root (in sentence 1) | ||||
|     assert doc2[0].dep_ == "ROOT" | ||||
|     assert doc2[1].head.text == "jests" | ||||
|     assert doc2[1].dep_ == "prep" | ||||
|     assert doc2[2].head.text == "at" | ||||
|     assert doc2[2].dep_ == "pobj" | ||||
|     assert doc2[3].head.text == "jests" | ||||
|     assert doc2[3].dep_ == "punct" | ||||
|     assert ( | ||||
|         doc2[4].head.text == "They" | ||||
|     )  # head set to itself, being the new artificial root (in sentence 2) | ||||
|     assert doc2[4].dep_ == "dep" | ||||
|     assert ( | ||||
|         doc2[4].head.text == "They" | ||||
|     )  # head set to the new artificial head (in sentence 2) | ||||
|     assert doc2[4].dep_ == "dep" | ||||
| 
 | ||||
|     # We should still have 2 sentences | ||||
|     sents = list(doc2.sents) | ||||
|     assert len(sents) == 2 | ||||
|     assert sents[0].text == "jests at scars ." | ||||
|     assert sents[1].text == "They never" | ||||
|  | @ -1,19 +0,0 @@ | |||
| from spacy.matcher import PhraseMatcher | ||||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
| def test_issue3972(en_vocab): | ||||
|     """Test that the PhraseMatcher returns duplicates for duplicate match IDs. | ||||
|     """ | ||||
|     matcher = PhraseMatcher(en_vocab) | ||||
|     matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) | ||||
|     matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) | ||||
|     doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) | ||||
|     matches = matcher(doc) | ||||
| 
 | ||||
|     assert len(matches) == 2 | ||||
| 
 | ||||
|     # We should have a match for each of the two rules | ||||
|     found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] | ||||
|     assert "A" in found_ids | ||||
|     assert "B" in found_ids | ||||
							
								
								
									
										469
									
								
								spacy/tests/regression/test_issue4001-4500.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										469
									
								
								spacy/tests/regression/test_issue4001-4500.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,469 @@ | |||
| import pytest | ||||
| from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe | ||||
| from spacy.pipeline.defaults import default_ner | ||||
| from spacy.matcher import PhraseMatcher, Matcher | ||||
| from spacy.tokens import Doc, Span, DocBin | ||||
| from spacy.gold import Example, Corpus | ||||
| from spacy.gold.converters import json2docs | ||||
| from spacy.vocab import Vocab | ||||
| from spacy.lang.en import English | ||||
| from spacy.util import minibatch, ensure_path, load_model | ||||
| from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex | ||||
| from spacy.tokenizer import Tokenizer | ||||
| from spacy.lang.el import Greek | ||||
| from spacy.language import Language | ||||
| import spacy | ||||
| from thinc.api import compounding | ||||
| from collections import defaultdict | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4002(en_vocab): | ||||
|     """Test that the PhraseMatcher can match on overwritten NORM attributes. | ||||
|     """ | ||||
|     matcher = PhraseMatcher(en_vocab, attr="NORM") | ||||
|     pattern1 = Doc(en_vocab, words=["c", "d"]) | ||||
|     assert [t.norm_ for t in pattern1] == ["c", "d"] | ||||
|     matcher.add("TEST", [pattern1]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c", "d"]) | ||||
|     assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] | ||||
|     matches = matcher(doc) | ||||
|     assert len(matches) == 1 | ||||
|     matcher = PhraseMatcher(en_vocab, attr="NORM") | ||||
|     pattern2 = Doc(en_vocab, words=["1", "2"]) | ||||
|     pattern2[0].norm_ = "c" | ||||
|     pattern2[1].norm_ = "d" | ||||
|     assert [t.norm_ for t in pattern2] == ["c", "d"] | ||||
|     matcher.add("TEST", [pattern2]) | ||||
|     matches = matcher(doc) | ||||
|     assert len(matches) == 1 | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4030(): | ||||
|     """ Test whether textcat works fine with empty doc """ | ||||
|     unique_classes = ["offensive", "inoffensive"] | ||||
|     x_train = [ | ||||
|         "This is an offensive text", | ||||
|         "This is the second offensive text", | ||||
|         "inoff", | ||||
|     ] | ||||
|     y_train = ["offensive", "offensive", "inoffensive"] | ||||
|     nlp = spacy.blank("en") | ||||
|     # preparing the data | ||||
|     train_data = [] | ||||
|     for text, train_instance in zip(x_train, y_train): | ||||
|         cat_dict = {label: label == train_instance for label in unique_classes} | ||||
|         train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) | ||||
|     # add a text categorizer component | ||||
|     textcat = nlp.create_pipe( | ||||
|         "textcat", | ||||
|         config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, | ||||
|     ) | ||||
|     for label in unique_classes: | ||||
|         textcat.add_label(label) | ||||
|     nlp.add_pipe(textcat, last=True) | ||||
|     # training the network | ||||
|     with nlp.select_pipes(enable="textcat"): | ||||
|         optimizer = nlp.begin_training() | ||||
|         for i in range(3): | ||||
|             losses = {} | ||||
|             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) | ||||
| 
 | ||||
|             for batch in batches: | ||||
|                 nlp.update( | ||||
|                     examples=batch, sgd=optimizer, drop=0.1, losses=losses, | ||||
|                 ) | ||||
|     # processing of an empty doc should result in 0.0 for all categories | ||||
|     doc = nlp("") | ||||
|     assert doc.cats["offensive"] == 0.0 | ||||
|     assert doc.cats["inoffensive"] == 0.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue4042(): | ||||
|     """Test that serialization of an EntityRuler before NER works fine.""" | ||||
|     nlp = English() | ||||
| 
 | ||||
|     # add ner pipe | ||||
|     ner = nlp.create_pipe("ner") | ||||
|     ner.add_label("SOME_LABEL") | ||||
|     nlp.add_pipe(ner) | ||||
|     nlp.begin_training() | ||||
| 
 | ||||
|     # Add entity ruler | ||||
|     ruler = EntityRuler(nlp) | ||||
|     patterns = [ | ||||
|         {"label": "MY_ORG", "pattern": "Apple"}, | ||||
|         {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, | ||||
|     ] | ||||
|     ruler.add_patterns(patterns) | ||||
|     nlp.add_pipe(ruler, before="ner")  # works fine with "after" | ||||
|     doc1 = nlp("What do you think about Apple ?") | ||||
|     assert doc1.ents[0].label_ == "MY_ORG" | ||||
| 
 | ||||
|     with make_tempdir() as d: | ||||
|         output_dir = ensure_path(d) | ||||
|         if not output_dir.exists(): | ||||
|             output_dir.mkdir() | ||||
|         nlp.to_disk(output_dir) | ||||
| 
 | ||||
|         nlp2 = load_model(output_dir) | ||||
|         doc2 = nlp2("What do you think about Apple ?") | ||||
|         assert doc2.ents[0].label_ == "MY_ORG" | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue4042_bug2(): | ||||
|     """ | ||||
|     Test that serialization of an NER works fine when new labels were added. | ||||
|     This is the second bug of two bugs underlying the issue 4042. | ||||
|     """ | ||||
|     nlp1 = English() | ||||
|     vocab = nlp1.vocab | ||||
| 
 | ||||
|     # add ner pipe | ||||
|     ner1 = nlp1.create_pipe("ner") | ||||
|     ner1.add_label("SOME_LABEL") | ||||
|     nlp1.add_pipe(ner1) | ||||
|     nlp1.begin_training() | ||||
| 
 | ||||
|     # add a new label to the doc | ||||
|     doc1 = nlp1("What do you think about Apple ?") | ||||
|     assert len(ner1.labels) == 1 | ||||
|     assert "SOME_LABEL" in ner1.labels | ||||
|     apple_ent = Span(doc1, 5, 6, label="MY_ORG") | ||||
|     doc1.ents = list(doc1.ents) + [apple_ent] | ||||
| 
 | ||||
|     # reapply the NER - at this point it should resize itself | ||||
|     ner1(doc1) | ||||
|     assert len(ner1.labels) == 2 | ||||
|     assert "SOME_LABEL" in ner1.labels | ||||
|     assert "MY_ORG" in ner1.labels | ||||
| 
 | ||||
|     with make_tempdir() as d: | ||||
|         # assert IO goes fine | ||||
|         output_dir = ensure_path(d) | ||||
|         if not output_dir.exists(): | ||||
|             output_dir.mkdir() | ||||
|         ner1.to_disk(output_dir) | ||||
| 
 | ||||
|         config = { | ||||
|             "learn_tokens": False, | ||||
|             "min_action_freq": 30, | ||||
|             "beam_width": 1, | ||||
|             "beam_update_prob": 1.0, | ||||
|         } | ||||
|         ner2 = EntityRecognizer(vocab, default_ner(), **config) | ||||
|         ner2.from_disk(output_dir) | ||||
|         assert len(ner2.labels) == 2 | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4054(en_vocab): | ||||
|     """Test that a new blank model can be made with a vocab from file, | ||||
|     and that serialization does not drop the language at any point.""" | ||||
|     nlp1 = English() | ||||
|     vocab1 = nlp1.vocab | ||||
|     with make_tempdir() as d: | ||||
|         vocab_dir = ensure_path(d / "vocab") | ||||
|         if not vocab_dir.exists(): | ||||
|             vocab_dir.mkdir() | ||||
|         vocab1.to_disk(vocab_dir) | ||||
|         vocab2 = Vocab().from_disk(vocab_dir) | ||||
|         print("lang", vocab2.lang) | ||||
|         nlp2 = spacy.blank("en", vocab=vocab2) | ||||
|         nlp_dir = ensure_path(d / "nlp") | ||||
|         if not nlp_dir.exists(): | ||||
|             nlp_dir.mkdir() | ||||
|         nlp2.to_disk(nlp_dir) | ||||
|         nlp3 = load_model(nlp_dir) | ||||
|         assert nlp3.lang == "en" | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4120(en_vocab): | ||||
|     """Test that matches without a final {OP: ?} token are returned.""" | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) | ||||
|     doc1 = Doc(en_vocab, words=["a"]) | ||||
|     assert len(matcher(doc1)) == 1  # works | ||||
|     doc2 = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     assert len(matcher(doc2)) == 2  # fixed | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) | ||||
|     doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) | ||||
|     assert len(matcher(doc3)) == 2  # works | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) | ||||
|     doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) | ||||
|     assert len(matcher(doc4)) == 3  # fixed | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4133(en_vocab): | ||||
|     nlp = English() | ||||
|     vocab_bytes = nlp.vocab.to_bytes() | ||||
|     words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] | ||||
|     pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] | ||||
|     doc = Doc(en_vocab, words=words) | ||||
|     for i, token in enumerate(doc): | ||||
|         token.pos_ = pos[i] | ||||
|     # usually this is already True when starting from proper models instead of blank English | ||||
|     doc.is_tagged = True | ||||
|     doc_bytes = doc.to_bytes() | ||||
|     vocab = Vocab() | ||||
|     vocab = vocab.from_bytes(vocab_bytes) | ||||
|     doc = Doc(vocab).from_bytes(doc_bytes) | ||||
|     actual = [] | ||||
|     for token in doc: | ||||
|         actual.append(token.pos_) | ||||
|     assert actual == pos | ||||
| 
 | ||||
| 
 | ||||
def test_issue4190():
    """Check that a customized tokenizer gives the same tokens after saving and reloading the pipeline."""

    def customize_tokenizer(nlp):
        # Replace nlp.tokenizer with one rebuilt from the language defaults,
        # using a filtered set of tokenizer exceptions.
        defaults = nlp.Defaults
        prefix_re = compile_prefix_regex(defaults.prefixes)
        suffix_re = compile_suffix_regex(defaults.suffixes)
        infix_re = compile_infix_regex(defaults.infixes)
        # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
        exceptions = {}
        for key, value in dict(defaults.tokenizer_exceptions).items():
            if len(key) != 2 or key[1] != ".":
                exceptions[key] = value
        nlp.tokenizer = Tokenizer(
            nlp.vocab,
            exceptions,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=nlp.tokenizer.token_match,
        )

    test_string = "Test c."
    # Tokenize once with the default English tokenizer
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Tokenize again after swapping in the customized tokenizer
    customize_tokenizer(nlp_1)
    result_1b = [token.text for token in nlp_1(test_string)]
    # Write the pipeline to disk and load it back
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = load_model(model_dir)
    # The reloaded pipeline should tokenize like the customized one
    result_2 = [token.text for token in nlp_2(test_string)]
    assert result_1b == result_2
| 
 | ||||
| 
 | ||||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
    """Test that running an entity_ruler after ner gives consistent results."""
    text = "hi"
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()
    assert "ner" in nlp.pipe_names
    # With only the NER component, every token must carry ent_iob == 2
    before = nlp(text)
    assert before.is_nered
    assert all(token.ent_iob == 2 for token in before)
    # Append an entity ruler whose only pattern does not occur in the text
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "SOFTWARE", "pattern": "spacy"}])
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names
    # The annotations must be unchanged with the ruler in the pipeline
    after = nlp(text)
    assert after.is_nered
    assert all(token.ent_iob == 2 for token in after)
| 
 | ||||
| 
 | ||||
def test_issue4272():
    """Test that the lemma lookup is reachable through Token.lemma_ when no
    POS tags are available (the first Greek token must have a non-empty lemma)."""
    first_token = Greek()("Χθες")[0]
    assert first_token.lemma_
| 
 | ||||
| 
 | ||||
def test_multiple_predictions():
    """Call a Pipe subclass whose predict() returns a tuple of two lists; it must not raise."""

    class DummyPipe(Pipe):
        def __init__(self):
            # Only a placeholder model is set; the parent initializer is not called.
            self.model = "dummy_model"

        def predict(self, docs):
            first_output = [1, 2, 3]
            second_output = [4, 5, 6]
            return first_output, second_output

        def set_annotations(self, docs, scores):
            return docs

    doc = Language().make_doc("foo")
    pipe = DummyPipe()
    pipe(doc)
| 
 | ||||
| 
 | ||||
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    nlp = English()
    ner_config = dict(
        learn_tokens=False,
        min_action_freq=30,
        beam_width=1,
        beam_update_prob=1.0,
    )
    ner = EntityRecognizer(nlp.vocab, default_ner(), **ner_config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # Attach an entity whose label the NER component has not been given
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    doc.ents = list(doc.ents) + [Span(doc, 5, 6, label="MY_ORG")]

    # Beam parsing must still run with the extra label present
    docs = [doc]
    beams = nlp.entity.beam_parse(docs, beam_width=16, beam_density=0.0001)

    # Walk every parse in every beam and sum the scores per (start, end, label)
    for _, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                key = (start, end, label)
                entity_scores[key] += score
| 
 | ||||
| 
 | ||||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data doesn't throw errors."""
    nlp = English()
    empty_example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    train_examples = [empty_example, empty_example]
    nlp.add_pipe(nlp.create_pipe("tagger"))
    optimizer = nlp.begin_training()
    for _ in range(5):
        losses = {}
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for batch in minibatch(train_examples, size=batch_sizes):
            nlp.update(batch, sgd=optimizer, losses=losses)
| 
 | ||||
| 
 | ||||
def test_issue4367():
    """Test that DocBin can be constructed with no attrs and with explicit attr lists."""
    DocBin()
    for attrs in (["LEMMA"], ["LEMMA", "ENT_IOB", "ENT_TYPE"]):
        DocBin(attrs=attrs)
| 
 | ||||
| 
 | ||||
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    for matcher_cls in (Matcher, PhraseMatcher):
        matcher = matcher_cls(Vocab())
        assert isinstance(matcher.vocab, Vocab)
| 
 | ||||
| 
 | ||||
def test_issue4402():
    """Check that a two-paragraph JSON corpus loads as 2 examples and splits into 4 sentences.

    The JSON is converted to docs, written to a .spacy file via DocBin, read
    back through Corpus, and each example is split with split_sents().
    """

    def sentence(words, first_id):
        # One sentence entry: consecutive token ids, every token tagged "O".
        tokens = [
            {"id": first_id + offset, "orth": word, "ner": "O"}
            for offset, word in enumerate(words)
        ]
        return {"tokens": tokens, "brackets": []}

    def cats(baking, not_baking):
        # Category entries for the two labels used by the fixture.
        return [
            {"label": "baking", "value": baking},
            {"label": "not_baking", "value": not_baking},
        ]

    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    sentence(
                        ["How", "should", "I", "cook", "bacon", "in", "an", "oven", "?"],
                        0,
                    ),
                    sentence(
                        [
                            "\n",
                            "I",
                            "'ve",
                            "heard",
                            "of",
                            "people",
                            "cooking",
                            "bacon",
                            "in",
                            "an",
                            "oven",
                            ".",
                        ],
                        9,
                    ),
                ],
                "cats": cats(1.0, 0.0),
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    sentence(
                        [
                            "What",
                            "is",
                            "the",
                            "difference",
                            "between",
                            "white",
                            "and",
                            "brown",
                            "eggs",
                            "?",
                        ],
                        0,
                    ),
                    sentence(["\n"], 10),
                ],
                "cats": cats(0.0, 1.0),
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        serialized = DocBin(docs=json2docs([json_data]), attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(serialized)
        # The same file serves as both the train and the dev location
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = [
            sent_eg for eg in train_data for sent_eg in eg.split_sents()
        ]
        assert len(split_train_data) == 4
|  | @ -1,23 +0,0 @@ | |||
| from spacy.matcher import PhraseMatcher | ||||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes."""

    def norms(tokens):
        return [token.norm_ for token in tokens]

    # Pattern whose NORM values are left untouched
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    plain_pattern = Doc(en_vocab, words=["c", "d"])
    assert norms(plain_pattern) == ["c", "d"]
    matcher.add("TEST", [plain_pattern])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert norms(doc) == ["a", "b", "c", "d"]
    assert len(matcher(doc)) == 1
    # Pattern with different words whose NORM values are overwritten to "c", "d"
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    overwritten_pattern = Doc(en_vocab, words=["1", "2"])
    for token, norm in zip(overwritten_pattern, ("c", "d")):
        token.norm_ = norm
    assert norms(overwritten_pattern) == ["c", "d"]
    matcher.add("TEST", [overwritten_pattern])
    assert len(matcher(doc)) == 1
|  | @ -1,50 +0,0 @@ | |||
| import spacy | ||||
| from spacy.util import minibatch | ||||
| from thinc.api import compounding | ||||
| from spacy.gold import Example | ||||
| 
 | ||||
| 
 | ||||
def test_issue4030():
    """ Test whether textcat works fine with empty doc """
    labels = ["offensive", "inoffensive"]
    texts = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    gold_labels = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # One Example per text; "cats" maps every label to True/False
    train_data = [
        Example.from_dict(
            nlp.make_doc(text),
            {"cats": {label: label == gold for label in labels}},
        )
        for text, gold in zip(texts, gold_labels)
    ]

    # Text categorizer with the bag-of-words architecture
    textcat_config = {"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}
    textcat = nlp.create_pipe("textcat", config=textcat_config)
    for label in labels:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # Three passes over the data with only the textcat component enabled
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for _ in range(3):
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(train_data, size=batch_sizes):
                nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)

    # processing of an empty doc should result in 0.0 for all categories
    empty_doc = nlp("")
    for label in labels:
        assert empty_doc.cats[label] == 0.0
|  | @ -1,85 +0,0 @@ | |||
| import spacy | ||||
| from spacy.pipeline import EntityRecognizer, EntityRuler | ||||
| from spacy.lang.en import English | ||||
| from spacy.tokens import Span | ||||
| from spacy.util import ensure_path | ||||
| from spacy.pipeline.defaults import default_ner | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    text = "What do you think about Apple ?"
    nlp = English()

    # NER component with a single label
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Entity ruler inserted in front of the NER component
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [
            {"label": "MY_ORG", "pattern": "Apple"},
            {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
        ]
    )
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    assert nlp(text).ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        # The reloaded pipeline must produce the same first entity label
        reloaded_nlp = spacy.load(output_dir)
        assert reloaded_nlp(text).ents[0].label_ == "MY_ORG"
| 
 | ||||
| 
 | ||||
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp = English()
    vocab = nlp.vocab

    # NER component that starts out with one label
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Put an entity with an unseen label on the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    doc.ents = list(doc.ents) + [Span(doc, 5, 6, label="MY_ORG")]

    # reapply the NER - at this point it should resize itself
    ner(doc)
    assert len(ner.labels) == 2
    for expected_label in ("SOME_LABEL", "MY_ORG"):
        assert expected_label in ner.labels

    with make_tempdir() as d:
        # Serialize the resized component
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner.to_disk(output_dir)

        # A freshly constructed component must pick up both labels from disk
        config = dict(
            learn_tokens=False,
            min_action_freq=30,
            beam_width=1,
            beam_update_prob=1.0,
        )
        restored_ner = EntityRecognizer(vocab, default_ner(), **config)
        restored_ner.from_disk(output_dir)
        assert len(restored_ner.labels) == 2
|  | @ -1,30 +0,0 @@ | |||
| from spacy.vocab import Vocab | ||||
| import spacy | ||||
| from spacy.lang.en import English | ||||
| from spacy.util import ensure_path | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point.

    The ``en_vocab`` fixture is not used in the body; the parameter is kept so
    the test signature stays unchanged.
    """
    nlp1 = English()
    vocab1 = nlp1.vocab

    with make_tempdir() as d:
        # Round-trip the English vocab through its own sub-directory
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)

        # Build a blank English pipeline on top of the reloaded vocab
        vocab2 = Vocab().from_disk(vocab_dir)
        nlp2 = spacy.blank("en", vocab=vocab2)

        # Round-trip the whole pipeline and check the language code survived
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"
|  | @ -1,23 +0,0 @@ | |||
| from spacy.matcher import Matcher | ||||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    trailing_optional = [{"ORTH": "a"}, {"OP": "?"}]
    inner_optional = [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]
    two_optionals = [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]
    abbc = ["a", "b", "b", "c"]

    # Pattern ending in an optional token, on a one-word and a three-word doc
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [trailing_optional])
    matches = matcher(Doc(en_vocab, words=["a"]))
    assert len(matches) == 1  # works
    matches = matcher(Doc(en_vocab, words=["a", "b", "c"]))
    assert len(matches) == 2  # fixed

    # Optional token in the middle of the pattern
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [inner_optional])
    matches = matcher(Doc(en_vocab, words=abbc))
    assert len(matches) == 2  # works

    # Two optional tokens at the end of the pattern
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [two_optionals])
    matches = matcher(Doc(en_vocab, words=abbc))
    assert len(matches) == 3  # fixed
|  | @ -1,28 +0,0 @@ | |||
| from spacy.lang.en import English | ||||
| from spacy.tokens import Doc | ||||
| from spacy.vocab import Vocab | ||||
| 
 | ||||
| 
 | ||||
def test_issue4133(en_vocab):
    """Test that POS tags set by hand are kept when a Doc is serialized to
    bytes and loaded with a Vocab that was itself restored from bytes."""
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()

    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    source_doc = Doc(en_vocab, words=words)
    for index in range(len(source_doc)):
        source_doc[index].pos_ = pos[index]

    # usually this is already True when starting from proper models instead of blank English
    source_doc.is_tagged = True

    doc_bytes = source_doc.to_bytes()

    fresh_vocab = Vocab()
    fresh_vocab = fresh_vocab.from_bytes(vocab_bytes)
    loaded_doc = Doc(fresh_vocab).from_bytes(doc_bytes)

    actual = [token.pos_ for token in loaded_doc]
    assert actual == pos
|  | @ -1,46 +0,0 @@ | |||
| from spacy.lang.en import English | ||||
| from spacy.tokenizer import Tokenizer | ||||
| from spacy import util | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
def test_issue4190():
    """Test that a customized tokenizer is preserved when the pipeline is saved and reloaded."""
    text = "Test c."
    # Tokenize with the default English tokenizer first
    nlp_1 = English()
    default_doc = nlp_1(text)
    result_1a = [token.text for token in default_doc]  # noqa: F841
    # Swap in the customized tokenizer and tokenize again
    customize_tokenizer(nlp_1)
    customized_tokens = [token.text for token in nlp_1(text)]
    # Save the pipeline and load it back
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = util.load_model(model_dir)
    # The reloaded pipeline should use the modified tokenizer
    reloaded_tokens = [token.text for token in nlp_2(text)]
    assert customized_tokens == reloaded_tokens
| 
 | ||||
| 
 | ||||
def customize_tokenizer(nlp):
    """Replace ``nlp.tokenizer`` with a Tokenizer rebuilt from the language defaults.

    Prefix, suffix and infix rules come from ``nlp.Defaults``; the tokenizer
    exceptions are filtered (see below) and the existing ``token_match`` is kept.
    """
    defaults = nlp.Defaults
    prefix_search = util.compile_prefix_regex(defaults.prefixes).search
    suffix_search = util.compile_suffix_regex(defaults.suffixes).search
    infix_finditer = util.compile_infix_regex(defaults.infixes).finditer
    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {}
    for key, value in dict(defaults.tokenizer_exceptions).items():
        if len(key) != 2 or key[1] != ".":
            exceptions[key] = value
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=nlp.tokenizer.token_match,
    )
|  | @ -1,34 +0,0 @@ | |||
| from spacy.lang.en import English | ||||
| from spacy.pipeline import EntityRuler | ||||
| 
 | ||||
| 
 | ||||
def test_issue4267():
    """Test that running an entity_ruler after ner gives consistent results."""

    def check_annotations(doc):
        # The doc must be flagged as NER-processed and every token must have ent_iob == 2
        assert doc.is_nered
        for token in doc:
            assert token.ent_iob == 2

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()

    assert "ner" in nlp.pipe_names

    # Annotations with only the NER component in the pipeline
    check_annotations(nlp("hi"))

    # Add an entity ruler (its pattern does not occur in the text) and run again
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "SOFTWARE", "pattern": "spacy"}])
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names

    # Annotations must be the same with the ruler present
    check_annotations(nlp("hi"))
|  | @ -1,9 +0,0 @@ | |||
| from spacy.lang.el import Greek | ||||
| 
 | ||||
| 
 | ||||
def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    doc = Greek()("Χθες")
    lemma = doc[0].lemma_
    assert lemma
|  | @ -1,25 +0,0 @@ | |||
| import pytest | ||||
| from spacy.language import Language | ||||
| from spacy.pipeline import Pipe | ||||
| 
 | ||||
| 
 | ||||
class DummyPipe(Pipe):
    """Minimal Pipe subclass whose predict() returns a tuple of two lists."""

    def __init__(self):
        # Only a placeholder model is set; the parent initializer is not called.
        self.model = "dummy_model"

    def predict(self, docs):
        """Return two fixed lists as a tuple, ignoring ``docs``."""
        first_output = [1, 2, 3]
        second_output = [4, 5, 6]
        return first_output, second_output

    def set_annotations(self, docs, scores, tensors=None):
        """Return ``docs`` unchanged; ``scores`` and ``tensors`` are ignored."""
        return docs
| 
 | ||||
| 
 | ||||
@pytest.fixture
def nlp():
    """Provide a bare Language instance."""
    language = Language()
    return language
| 
 | ||||
| 
 | ||||
def test_multiple_predictions(nlp):
    """Call DummyPipe, whose predict() returns a tuple of two lists, on a doc; it must not raise."""
    pipe = DummyPipe()
    pipe(nlp.make_doc("foo"))
|  | @ -1,47 +0,0 @@ | |||
| from collections import defaultdict | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.pipeline.defaults import default_ner | ||||
| from spacy.pipeline import EntityRecognizer | ||||
| 
 | ||||
| from spacy.lang.en import English | ||||
| from spacy.tokens import Span | ||||
| 
 | ||||
| 
 | ||||
# The reason is passed to pytest so it appears in the skip report instead of
# living only in a source comment.
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(
        docs, beam_width=beam_width, beam_density=beam_density
    )

    # Sum the scores per (start, end, label); the test only checks that this
    # runs without crashing, the totals themselves are not asserted.
    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score
|  | @ -1,24 +0,0 @@ | |||
| from spacy.gold import Example | ||||
| from spacy.lang.en import English | ||||
| from spacy.util import minibatch | ||||
| from thinc.api import compounding | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data, doesn't throw errors"""
    nlp = English()
    blank_example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    examples = [blank_example] * 2

    nlp.add_pipe(nlp.create_pipe("tagger"))

    optimizer = nlp.begin_training()
    for _ in range(5):
        losses = {}
        for batch in minibatch(examples, size=compounding(4.0, 32.0, 1.001)):
            nlp.update(batch, sgd=optimizer, losses=losses)
|  | @ -1,8 +0,0 @@ | |||
| from spacy.tokens import DocBin | ||||
| 
 | ||||
| 
 | ||||
def test_issue4367():
    """Test that docbin init goes well"""
    attr_sets = [["LEMMA"], ["LEMMA", "ENT_IOB", "ENT_TYPE"]]
    DocBin()
    for attr_names in attr_sets:
        DocBin(attrs=attr_names)
|  | @ -1,10 +0,0 @@ | |||
| from spacy.matcher import Matcher, PhraseMatcher | ||||
| from spacy.vocab import Vocab | ||||
| 
 | ||||
| 
 | ||||
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    token_matcher = Matcher(Vocab())
    phrase_matcher = PhraseMatcher(Vocab())
    for matcher in (token_matcher, phrase_matcher):
        assert isinstance(matcher.vocab, Vocab)
|  | @ -1,98 +0,0 @@ | |||
| from spacy.gold import Corpus | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| from ...gold.converters import json2docs | ||||
| from ...tokens import DocBin | ||||
| 
 | ||||
| 
 | ||||
def test_issue4402():
    """Check that the module-level ``json_data`` corpus loads as 2 examples that split into 4 sentences."""
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        # Convert the JSON to docs and store them as a DocBin file
        serialized = DocBin(docs=json2docs([json_data]), attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(serialized)
        # The same file serves as both the train and the dev location
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = [
            sent_eg for eg in train_data for sent_eg in eg.split_sents()
        ]
        assert len(split_train_data) == 4
| 
 | ||||
| 
 | ||||
def _sentence(words, first_id):
    # One sentence entry: consecutive token ids starting at first_id, every token tagged "O".
    tokens = [
        {"id": first_id + offset, "orth": word, "ner": "O"}
        for offset, word in enumerate(words)
    ]
    return {"tokens": tokens, "brackets": []}


def _cats(baking, not_baking):
    # Category entries for the two labels used by the fixture.
    return [
        {"label": "baking", "value": baking},
        {"label": "not_baking", "value": not_baking},
    ]


# Two-paragraph corpus in the JSON training format consumed by json2docs.
json_data = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
            "sentences": [
                _sentence(
                    ["How", "should", "I", "cook", "bacon", "in", "an", "oven", "?"],
                    0,
                ),
                _sentence(
                    [
                        "\n",
                        "I",
                        "'ve",
                        "heard",
                        "of",
                        "people",
                        "cooking",
                        "bacon",
                        "in",
                        "an",
                        "oven",
                        ".",
                    ],
                    9,
                ),
            ],
            "cats": _cats(1.0, 0.0),
        },
        {
            "raw": "What is the difference between white and brown eggs?\n",
            "sentences": [
                _sentence(
                    [
                        "What",
                        "is",
                        "the",
                        "difference",
                        "between",
                        "white",
                        "and",
                        "brown",
                        "eggs",
                        "?",
                    ],
                    0,
                ),
                _sentence(["\n"], 10),
            ],
            "cats": _cats(0.0, 1.0),
        },
    ],
}
							
								
								
									
										288
									
								
								spacy/tests/regression/test_issue4501-5000.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										288
									
								
								spacy/tests/regression/test_issue4501-5000.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,288 @@ | |||
| import pytest | ||||
| from mock import Mock | ||||
| from spacy.pipeline import EntityRuler | ||||
| from spacy.matcher import DependencyMatcher | ||||
| from spacy.tokens import Doc, Span, DocBin | ||||
| from spacy.gold import Example | ||||
| from spacy.gold.converters.conllu2docs import conllu2docs | ||||
| from spacy.lang.en import English | ||||
| from spacy.kb import KnowledgeBase | ||||
| from spacy.vocab import Vocab | ||||
| from spacy.language import Language | ||||
| from spacy.util import ensure_path, load_model_from_path | ||||
| import numpy | ||||
| import pickle | ||||
| 
 | ||||
| from ..util import get_doc, make_tempdir | ||||
| 
 | ||||
| 
 | ||||
def test_issue4528(en_vocab):
    """Check that DocBin round-trips Doc.user_data through bytes.

    Both a plain string key and the tuple-shaped key used for extension
    attribute values must come back with their values intact.
    """
    # Extension attribute values are stored in user_data under a key like this.
    extension_key = ("._.", "foo", None, None)
    original = Doc(en_vocab, words=["hello", "world"])
    original.user_data["foo"] = "bar"
    original.user_data[extension_key] = "bar"

    writer = DocBin(store_user_data=True)
    writer.add(original)
    payload = writer.to_bytes()

    reader = DocBin(store_user_data=True).from_bytes(payload)
    restored_docs = list(reader.get_docs(en_vocab))
    restored = restored_docs[0]
    assert restored.user_data["foo"] == "bar"
    assert restored.user_data[extension_key] == "bar"
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
    "text,words",
    [
        ("A'B C", ["A", "'", "B", "C"]),
        ("A-B", ["A-B"]),
    ],
)
def test_gold_misaligned(en_tokenizer, text, words):
    """Check that building an Example from gold words does not raise.

    The doc comes from ``en_tokenizer(text)`` and the gold ``words`` are given
    separately; the test passes as long as ``Example.from_dict`` returns.
    """
    predicted = en_tokenizer(text)
    gold = {"words": words}
    Example.from_dict(predicted, gold)
| 
 | ||||
| 
 | ||||
def test_issue4590(en_vocab):
    """Check that on_match receives the same matches the matcher returns.

    A mock callback is registered together with a three-node dependency
    pattern. After the matcher has run, the fourth positional argument of the
    callback's most recent call is compared with the matcher's return value.
    """
    anchor_node = {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}
    fox_node = {
        "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
        "PATTERN": {"ORTH": "fox"},
    }
    # NOTE(review): this node is named "quick" but matches ORTH "fox" -- this
    # looks like a copy-paste leftover; confirm before changing it.
    quick_node = {
        "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
        "PATTERN": {"ORTH": "fox"},
    }

    callback = Mock()
    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", callback, [anchor_node, fox_node, quick_node])

    words = "The quick brown fox jumped over the lazy fox".split()
    doc = get_doc(
        en_vocab,
        words,
        heads=[3, 2, 1, 1, 0, -1, 2, 1, -3],
        deps=["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"],
    )
    matches = matcher(doc)

    # call_args[0] is the tuple of positional arguments of the last call.
    positional_args = callback.call_args[0]
    assert positional_args[3] == matches
| 
 | ||||
| 
 | ||||
def test_issue4651_with_phrase_matcher_attr():
    """Check EntityRuler.from_disk when phrase_matcher_attr is specified.

    A ruler created with ``phrase_matcher_attr="LOWER"`` is written with
    ``to_disk`` and read into a fresh ruler with ``from_disk``. Both pipelines
    must produce the same entities for the same text.
    """

    def describe_ents(doc):
        return [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

    text = "Spacy is a python library for nlp"

    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    ruler.add_patterns([{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}])
    nlp.add_pipe(ruler)
    expected = describe_ents(nlp(text))

    nlp_reloaded = English()
    with make_tempdir() as tmp_dir:
        ruler_path = tmp_dir / "entityruler"
        ruler.to_disk(ruler_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(ruler_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    actual = describe_ents(nlp_reloaded(text))

    assert expected == actual
| 
 | ||||
| 
 | ||||
def test_issue4651_without_phrase_matcher_attr():
    """Check EntityRuler.from_disk when phrase_matcher_attr is not specified.

    A ruler created with default arguments is written with ``to_disk`` and
    read into a fresh ruler with ``from_disk``. Both pipelines must produce
    the same entities for the same text.
    """

    def describe_ents(doc):
        return [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

    text = "Spacy is a python library for nlp"

    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}])
    nlp.add_pipe(ruler)
    expected = describe_ents(nlp(text))

    nlp_reloaded = English()
    with make_tempdir() as tmp_dir:
        ruler_path = tmp_dir / "entityruler"
        ruler.to_disk(ruler_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(ruler_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    actual = describe_ents(nlp_reloaded(text))

    assert expected == actual
| 
 | ||||
| 
 | ||||
def test_issue4665():
    """
    conllu2docs should not raise an exception if the HEAD column contains an
    underscore
    """
    # (FORM, UPOS, XPOS, DEPREL) per token; every other column, including
    # HEAD, is an underscore.
    rows = [
        ("[", "PUNCT", "-LRB-", "punct"),
        ("This", "DET", "DT", "det"),
        ("killing", "NOUN", "NN", "nsubj"),
        ("of", "ADP", "IN", "case"),
        ("a", "DET", "DT", "det"),
        ("respected", "ADJ", "JJ", "amod"),
        ("cleric", "NOUN", "NN", "nmod"),
        ("will", "AUX", "MD", "aux"),
        ("be", "AUX", "VB", "aux"),
        ("causing", "VERB", "VBG", "root"),
        ("us", "PRON", "PRP", "iobj"),
        ("trouble", "NOUN", "NN", "dobj"),
        ("for", "ADP", "IN", "case"),
        ("years", "NOUN", "NNS", "nmod"),
        ("to", "PART", "TO", "mark"),
        ("come", "VERB", "VB", "acl"),
        (".", "PUNCT", ".", "punct"),
        ("]", "PUNCT", "-RRB-", "punct"),
    ]
    lines = [
        "\t".join([str(index), form, "_", upos, xpos, "_", "_", deprel, "_", "_"])
        for index, (form, upos, xpos, deprel) in enumerate(rows, start=1)
    ]
    # The data starts and ends with a newline, with one token per line.
    input_data = "\n" + "\n".join(lines) + "\n"
    conllu2docs(input_data)
| 
 | ||||
| 
 | ||||
def test_issue4674():
    """Check KB IO after setting entities with a duplicated identifier.

    ``set_entities`` receives "Q1" twice, which must emit a UserWarning and
    leave a single entity. Dumping the KB to a file and bulk-loading it into a
    new KB must again give a single entity.
    """
    pipeline = English()
    source_kb = KnowledgeBase(pipeline.vocab, entity_vector_length=3)
    duplicated_ids = ["Q1", "Q1"]
    frequencies = [32, 111]
    vectors = [[0.9, 1.1, 1.01], [1.8, 2.25, 2.01]]
    with pytest.warns(UserWarning):
        source_kb.set_entities(
            entity_list=duplicated_ids,
            freq_list=frequencies,
            vector_list=vectors,
        )
    assert source_kb.get_size_entities() == 1

    # Round-trip through a file on disk.
    with make_tempdir() as tmp_dir:
        kb_dir = ensure_path(tmp_dir)
        if not kb_dir.exists():
            kb_dir.mkdir()
        kb_file = str(kb_dir / "kb")
        source_kb.dump(kb_file)
        loaded_kb = KnowledgeBase(vocab=pipeline.vocab, entity_vector_length=3)
        loaded_kb.load_bulk(kb_file)
    assert loaded_kb.get_size_entities() == 1
| 
 | ||||
| 
 | ||||
def test_issue4707():
    """Check that disabled component names are left out when loading a model.

    A pipeline with a sentencizer and an entity ruler is saved with the
    tokenizer and sentencizer excluded, then loaded with the same names
    disabled. Only the entity ruler may be present afterwards.
    """
    pipeline = English()
    for component_name in ("sentencizer", "entity_ruler"):
        pipeline.add_pipe(pipeline.create_pipe(component_name))
    assert pipeline.pipe_names == ["sentencizer", "entity_ruler"]

    skipped = ["tokenizer", "sentencizer"]
    with make_tempdir() as model_dir:
        pipeline.to_disk(model_dir, exclude=skipped)
        reloaded = load_model_from_path(model_dir, disable=skipped)

    assert "sentencizer" not in reloaded.pipe_names
    assert "entity_ruler" in reloaded.pipe_names
| 
 | ||||
| 
 | ||||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
    """Ensure the NER component keeps its config through a pickle round-trip."""
    expected_freq = 342
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": expected_freq})
    with make_tempdir() as tmp_path:
        pickle_path = tmp_path / "ner.pkl"
        with pickle_path.open("wb") as out_file:
            pickle.dump(ner, out_file)
            # Dumping must not alter the original component's config.
            assert ner.cfg["min_action_freq"] == expected_freq

        with pickle_path.open("rb") as in_file:
            unpickled = pickle.load(in_file)
            assert unpickled.cfg["min_action_freq"] == expected_freq
| 
 | ||||
| 
 | ||||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
    """Ensure multiprocess piping works when the vocab has vectors set.

    This must run correctly and neither hang nor crash because of the global
    vectors. If it does crash, it's usually because of calling 'spawn' for
    multiprocessing (e.g. on Windows).
    """
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    vectors = numpy.ndarray((5, 3), dtype="f")
    # Only the first two rows are assigned values and used below.
    vectors[0] = 1.0
    vectors[1] = 2.0
    for word, row in (("cat", 0), ("dog", 1)):
        vocab.set_vector(word, vectors[row])

    nlp = English(vocab=vocab)
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.begin_training()

    texts = ["Kurt is in London."] * 10
    # Only consuming the generator matters; the docs themselves are unused.
    for _doc in nlp.pipe(texts, batch_size=2, n_process=2):
        pass
| 
 | ||||
| 
 | ||||
def test_issue4849():
    """Check that EntityRuler entity ids are set with one and two processes.

    The ruler has two patterns with ids. Piping the same text must give two
    entities with a positive ``ent_id`` both for ``n_process=1`` and for
    ``n_process=2``.
    """
    nlp = English()
    patterns = [
        {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
        {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
    ]
    nlp.add_pipe(EntityRuler(nlp, patterns=patterns, phrase_matcher_attr="LOWER"))
    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """
    # First with a single process, then with two.
    for n_process in (1, 2):
        ents_with_id = sum(
            1
            for doc in nlp.pipe([text], n_process=n_process)
            for ent in doc.ents
            if ent.ent_id > 0
        )
        assert ents_with_id == 2
| 
 | ||||
| 
 | ||||
class CustomPipe:
    """Pipeline component that stores sentence end indices as extensions.

    Constructing an instance registers a ``my_ext`` extension on ``Span``
    (backed by a getter) and on ``Doc`` (with a ``None`` default). Calling the
    instance on a doc sets ``my_ext`` on every sentence and on the doc itself.
    """

    name = "my_pipe"

    def __init__(self):
        # Extensions are registered globally on the classes. force=True keeps
        # construction idempotent, so creating a second instance in the same
        # process does not raise because "my_ext" already exists.
        Span.set_extension("my_ext", getter=self._get_my_ext, force=True)
        Doc.set_extension("my_ext", default=None, force=True)

    def __call__(self, doc):
        """Set ``my_ext`` on each sentence and on ``doc``, then return ``doc``.

        The doc-level value is the per-sentence values joined by newlines.
        """
        sentence_values = []
        for sent in doc.sents:
            value = self._get_my_ext(sent)
            sent._.set("my_ext", value)
            sentence_values.append(value)

        doc._.set("my_ext", "\n".join(sentence_values))
        return doc

    @staticmethod
    def _get_my_ext(span):
        """Return ``span.end`` as a string."""
        return str(span.end)
| 
 | ||||
| 
 | ||||
def test_issue4903():
    """Ensure multiprocess piping with a custom component keeps the texts.

    This must run correctly and not hang or crash on Windows / macOS. The
    docs must come back with their original texts, in input order.
    """
    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    texts = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(texts, n_process=2))
    expected_texts = ("I like bananas.", "Do you like them?", "No, I prefer wasabi.")
    for position, expected in enumerate(expected_texts):
        assert docs[position].text == expected
| 
 | ||||
| 
 | ||||
def test_issue4924():
    """Check that evaluating an Example made from an empty text does not raise."""
    pipeline = Language()
    empty_doc = pipeline.make_doc("")
    examples = [Example.from_dict(empty_doc, {})]
    pipeline.evaluate(examples)
|  | @ -1,16 +0,0 @@ | |||
| from spacy.tokens import Doc, DocBin | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4528(en_vocab): | ||||
|     """Test that user_data is correctly serialized in DocBin.""" | ||||
|     doc = Doc(en_vocab, words=["hello", "world"]) | ||||
|     doc.user_data["foo"] = "bar" | ||||
|     # This is how extension attribute values are stored in the user data | ||||
|     doc.user_data[("._.", "foo", None, None)] = "bar" | ||||
|     doc_bin = DocBin(store_user_data=True) | ||||
|     doc_bin.add(doc) | ||||
|     doc_bin_bytes = doc_bin.to_bytes() | ||||
|     new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) | ||||
|     new_doc = list(new_doc_bin.get_docs(en_vocab))[0] | ||||
|     assert new_doc.user_data["foo"] == "bar" | ||||
|     assert new_doc.user_data[("._.", "foo", None, None)] == "bar" | ||||
|  | @ -1,11 +0,0 @@ | |||
| import pytest | ||||
| 
 | ||||
| from spacy.gold import Example | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] | ||||
| ) | ||||
| def test_gold_misaligned(en_tokenizer, text, words): | ||||
|     doc = en_tokenizer(text) | ||||
|     Example.from_dict(doc, {"words": words}) | ||||
|  | @ -1,35 +0,0 @@ | |||
| from mock import Mock | ||||
| from spacy.matcher import DependencyMatcher | ||||
| from ..util import get_doc | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4590(en_vocab): | ||||
|     """Test that matches param in on_match method are the same as matches run with no on_match method""" | ||||
|     pattern = [ | ||||
|         {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, | ||||
|         { | ||||
|             "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, | ||||
|             "PATTERN": {"ORTH": "fox"}, | ||||
|         }, | ||||
|         { | ||||
|             "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, | ||||
|             "PATTERN": {"ORTH": "fox"}, | ||||
|         }, | ||||
|     ] | ||||
| 
 | ||||
|     on_match = Mock() | ||||
| 
 | ||||
|     matcher = DependencyMatcher(en_vocab) | ||||
|     matcher.add("pattern", on_match, pattern) | ||||
| 
 | ||||
|     text = "The quick brown fox jumped over the lazy fox" | ||||
|     heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] | ||||
|     deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] | ||||
| 
 | ||||
|     doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) | ||||
| 
 | ||||
|     matches = matcher(doc) | ||||
| 
 | ||||
|     on_match_args = on_match.call_args | ||||
| 
 | ||||
|     assert on_match_args[0][3] == matches | ||||
|  | @ -1,62 +0,0 @@ | |||
| from spacy.lang.en import English | ||||
| from spacy.pipeline import EntityRuler | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4651_with_phrase_matcher_attr(): | ||||
|     """Test that the EntityRuler PhraseMatcher is deserialize correctly using | ||||
|     the method from_disk when the EntityRuler argument phrase_matcher_attr is | ||||
|     specified. | ||||
|     """ | ||||
|     text = "Spacy is a python library for nlp" | ||||
| 
 | ||||
|     nlp = English() | ||||
|     ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") | ||||
|     patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] | ||||
|     ruler.add_patterns(patterns) | ||||
|     nlp.add_pipe(ruler) | ||||
| 
 | ||||
|     doc = nlp(text) | ||||
|     res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] | ||||
| 
 | ||||
|     nlp_reloaded = English() | ||||
|     with make_tempdir() as d: | ||||
|         file_path = d / "entityruler" | ||||
|         ruler.to_disk(file_path) | ||||
|         ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) | ||||
| 
 | ||||
|     nlp_reloaded.add_pipe(ruler_reloaded) | ||||
|     doc_reloaded = nlp_reloaded(text) | ||||
|     res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] | ||||
| 
 | ||||
|     assert res == res_reloaded | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4651_without_phrase_matcher_attr(): | ||||
|     """Test that the EntityRuler PhraseMatcher is deserialize correctly using | ||||
|     the method from_disk when the EntityRuler argument phrase_matcher_attr is | ||||
|     not specified. | ||||
|     """ | ||||
|     text = "Spacy is a python library for nlp" | ||||
| 
 | ||||
|     nlp = English() | ||||
|     ruler = EntityRuler(nlp) | ||||
|     patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] | ||||
|     ruler.add_patterns(patterns) | ||||
|     nlp.add_pipe(ruler) | ||||
| 
 | ||||
|     doc = nlp(text) | ||||
|     res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] | ||||
| 
 | ||||
|     nlp_reloaded = English() | ||||
|     with make_tempdir() as d: | ||||
|         file_path = d / "entityruler" | ||||
|         ruler.to_disk(file_path) | ||||
|         ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) | ||||
| 
 | ||||
|     nlp_reloaded.add_pipe(ruler_reloaded) | ||||
|     doc_reloaded = nlp_reloaded(text) | ||||
|     res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] | ||||
| 
 | ||||
|     assert res == res_reloaded | ||||
|  | @ -1,35 +0,0 @@ | |||
| import pytest | ||||
| 
 | ||||
| # TODO | ||||
| # from spacy.gold.converters.conllu2docs import conllu2docs | ||||
| 
 | ||||
| input_data = """ | ||||
| 1	[	_	PUNCT	-LRB-	_	_	punct	_	_ | ||||
| 2	This	_	DET	DT	_	_	det	_	_ | ||||
| 3	killing	_	NOUN	NN	_	_	nsubj	_	_ | ||||
| 4	of	_	ADP	IN	_	_	case	_	_ | ||||
| 5	a	_	DET	DT	_	_	det	_	_ | ||||
| 6	respected	_	ADJ	JJ	_	_	amod	_	_ | ||||
| 7	cleric	_	NOUN	NN	_	_	nmod	_	_ | ||||
| 8	will	_	AUX	MD	_	_	aux	_	_ | ||||
| 9	be	_	AUX	VB	_	_	aux	_	_ | ||||
| 10	causing	_	VERB	VBG	_	_	root	_	_ | ||||
| 11	us	_	PRON	PRP	_	_	iobj	_	_ | ||||
| 12	trouble	_	NOUN	NN	_	_	dobj	_	_ | ||||
| 13	for	_	ADP	IN	_	_	case	_	_ | ||||
| 14	years	_	NOUN	NNS	_	_	nmod	_	_ | ||||
| 15	to	_	PART	TO	_	_	mark	_	_ | ||||
| 16	come	_	VERB	VB	_	_	acl	_	_ | ||||
| 17	.	_	PUNCT	.	_	_	punct	_	_ | ||||
| 18	]	_	PUNCT	-RRB-	_	_	punct	_	_ | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_issue4665(): | ||||
|     """ | ||||
|     conllu2json should not raise an exception if the HEAD column contains an | ||||
|     underscore | ||||
|     """ | ||||
|     pass | ||||
|     # conllu2json(input_data) | ||||
|  | @ -1,36 +0,0 @@ | |||
| import pytest | ||||
| from spacy.kb import KnowledgeBase | ||||
| from spacy.util import ensure_path | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4674(): | ||||
|     """Test that setting entities with overlapping identifiers does not mess up IO""" | ||||
|     nlp = English() | ||||
|     kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) | ||||
| 
 | ||||
|     vector1 = [0.9, 1.1, 1.01] | ||||
|     vector2 = [1.8, 2.25, 2.01] | ||||
|     with pytest.warns(UserWarning): | ||||
|         kb.set_entities( | ||||
|             entity_list=["Q1", "Q1"], | ||||
|             freq_list=[32, 111], | ||||
|             vector_list=[vector1, vector2], | ||||
|         ) | ||||
| 
 | ||||
|     assert kb.get_size_entities() == 1 | ||||
| 
 | ||||
|     # dumping to file & loading back in | ||||
|     with make_tempdir() as d: | ||||
|         dir_path = ensure_path(d) | ||||
|         if not dir_path.exists(): | ||||
|             dir_path.mkdir() | ||||
|         file_path = dir_path / "kb" | ||||
|         kb.dump(str(file_path)) | ||||
| 
 | ||||
|         kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) | ||||
|         kb2.load_bulk(str(file_path)) | ||||
| 
 | ||||
|     assert kb2.get_size_entities() == 1 | ||||
|  | @ -1,20 +0,0 @@ | |||
| from spacy.util import load_model_from_path | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4707(): | ||||
|     """Tests that disabled component names are also excluded from nlp.from_disk | ||||
|     by default when loading a model. | ||||
|     """ | ||||
|     nlp = English() | ||||
|     nlp.add_pipe(nlp.create_pipe("sentencizer")) | ||||
|     nlp.add_pipe(nlp.create_pipe("entity_ruler")) | ||||
|     assert nlp.pipe_names == ["sentencizer", "entity_ruler"] | ||||
|     exclude = ["tokenizer", "sentencizer"] | ||||
|     with make_tempdir() as tmpdir: | ||||
|         nlp.to_disk(tmpdir, exclude=exclude) | ||||
|         new_nlp = load_model_from_path(tmpdir, disable=exclude) | ||||
|     assert "sentencizer" not in new_nlp.pipe_names | ||||
|     assert "entity_ruler" in new_nlp.pipe_names | ||||
|  | @ -1,41 +0,0 @@ | |||
| import pickle | ||||
| import numpy | ||||
| 
 | ||||
| from spacy.lang.en import English | ||||
| from spacy.vocab import Vocab | ||||
| 
 | ||||
| from spacy.tests.util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| def test_pickle_ner(): | ||||
|     """ Ensure the pickling of the NER goes well""" | ||||
|     vocab = Vocab(vectors_name="test_vocab_add_vector") | ||||
|     nlp = English(vocab=vocab) | ||||
|     ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) | ||||
|     with make_tempdir() as tmp_path: | ||||
|         with (tmp_path / "ner.pkl").open("wb") as file_: | ||||
|             pickle.dump(ner, file_) | ||||
|             assert ner.cfg["min_action_freq"] == 342 | ||||
| 
 | ||||
|         with (tmp_path / "ner.pkl").open("rb") as file_: | ||||
|             ner2 = pickle.load(file_) | ||||
|             assert ner2.cfg["min_action_freq"] == 342 | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4725(): | ||||
|     # ensures that this runs correctly and doesn't hang or crash because of the global vectors | ||||
|     # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows) | ||||
|     vocab = Vocab(vectors_name="test_vocab_add_vector") | ||||
|     data = numpy.ndarray((5, 3), dtype="f") | ||||
|     data[0] = 1.0 | ||||
|     data[1] = 2.0 | ||||
|     vocab.set_vector("cat", data[0]) | ||||
|     vocab.set_vector("dog", data[1]) | ||||
| 
 | ||||
|     nlp = English(vocab=vocab) | ||||
|     ner = nlp.create_pipe("ner") | ||||
|     nlp.add_pipe(ner) | ||||
|     nlp.begin_training() | ||||
|     docs = ["Kurt is in London."] * 10 | ||||
|     for _ in nlp.pipe(docs, batch_size=2, n_process=2): | ||||
|         pass | ||||
|  | @ -1,34 +0,0 @@ | |||
| from spacy.lang.en import English | ||||
| from spacy.pipeline import EntityRuler | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4849(): | ||||
|     nlp = English() | ||||
| 
 | ||||
|     ruler = EntityRuler( | ||||
|         nlp, | ||||
|         patterns=[ | ||||
|             {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, | ||||
|             {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, | ||||
|         ], | ||||
|         phrase_matcher_attr="LOWER", | ||||
|     ) | ||||
| 
 | ||||
|     nlp.add_pipe(ruler) | ||||
| 
 | ||||
|     text = """ | ||||
|     The left is starting to take aim at Democratic front-runner Joe Biden. | ||||
|     Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." | ||||
|     """ | ||||
| 
 | ||||
|     # USING 1 PROCESS | ||||
|     count_ents = 0 | ||||
|     for doc in nlp.pipe([text], n_process=1): | ||||
|         count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) | ||||
|     assert count_ents == 2 | ||||
| 
 | ||||
|     # USING 2 PROCESSES | ||||
|     count_ents = 0 | ||||
|     for doc in nlp.pipe([text], n_process=2): | ||||
|         count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) | ||||
|     assert count_ents == 2 | ||||
|  | @ -1,40 +0,0 @@ | |||
| from spacy.lang.en import English | ||||
| from spacy.tokens import Span, Doc | ||||
| 
 | ||||
| 
 | ||||
| class CustomPipe: | ||||
|     name = "my_pipe" | ||||
| 
 | ||||
|     def __init__(self): | ||||
|         Span.set_extension("my_ext", getter=self._get_my_ext) | ||||
|         Doc.set_extension("my_ext", default=None) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         gathered_ext = [] | ||||
|         for sent in doc.sents: | ||||
|             sent_ext = self._get_my_ext(sent) | ||||
|             sent._.set("my_ext", sent_ext) | ||||
|             gathered_ext.append(sent_ext) | ||||
| 
 | ||||
|         doc._.set("my_ext", "\n".join(gathered_ext)) | ||||
| 
 | ||||
|         return doc | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _get_my_ext(span): | ||||
|         return str(span.end) | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4903(): | ||||
|     # ensures that this runs correctly and doesn't hang or crash on Windows / macOS | ||||
| 
 | ||||
|     nlp = English() | ||||
|     custom_component = CustomPipe() | ||||
|     nlp.add_pipe(nlp.create_pipe("sentencizer")) | ||||
|     nlp.add_pipe(custom_component, after="sentencizer") | ||||
| 
 | ||||
|     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] | ||||
|     docs = list(nlp.pipe(text, n_process=2)) | ||||
|     assert docs[0].text == "I like bananas." | ||||
|     assert docs[1].text == "Do you like them?" | ||||
|     assert docs[2].text == "No, I prefer wasabi." | ||||
|  | @ -1,8 +0,0 @@ | |||
| from spacy.gold import Example | ||||
| from spacy.language import Language | ||||
| 
 | ||||
| 
 | ||||
| def test_issue4924(): | ||||
|     nlp = Language() | ||||
|     example = Example.from_dict(nlp.make_doc(""), {}) | ||||
|     nlp.evaluate([example]) | ||||
|  | @ -1,6 +1,8 @@ | |||
| import pytest | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::UserWarning") | ||||
| def test_issue5152(): | ||||
|     # Test that the comparison between a Span and a Token, goes well | ||||
|     # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) | ||||
|  | @ -8,7 +10,6 @@ def test_issue5152(): | |||
|     text = nlp("Talk about being boring!") | ||||
|     text_var = nlp("Talk of being boring!") | ||||
|     y = nlp("Let") | ||||
| 
 | ||||
|     span = text[0:3]  # Talk about being | ||||
|     span_2 = text[0:3]  # Talk about being | ||||
|     span_3 = text_var[0:3]  # Talk of being | ||||
|  |  | |||
|  | @ -63,7 +63,8 @@ def tagger(): | |||
|     # need to add model for two reasons: | ||||
|     # 1. no model leads to error in serialization, | ||||
|     # 2. the affected line is the one for model serialization | ||||
|     tagger.begin_training(pipeline=nlp.pipeline) | ||||
|     with pytest.warns(UserWarning): | ||||
|         tagger.begin_training(pipeline=nlp.pipeline) | ||||
|     return tagger | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										31
									
								
								spacy/tests/regression/test_issue5551.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								spacy/tests/regression/test_issue5551.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,31 @@ | |||
| from spacy.lang.en import English | ||||
| from spacy.util import fix_random_seed | ||||
| 
 | ||||
| 
 | ||||
def test_issue5551():
    """Test that pipeline results are truly identical after fixing the seed.

    A text classifier is created and initialised three times, each time right
    after resetting the random seed to 0. The model's prediction for the same
    text must be the same in every run.
    """
    component = "textcat"
    pipe_cfg = {"exclusive_classes": False}
    text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
    cats = {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}
    n_runs = 3

    predictions = []
    for _ in range(n_runs):
        # Reset the seed before anything is created in this run.
        fix_random_seed(0)
        nlp = English()
        nlp.add_pipe(nlp.create_pipe(component, config=pipe_cfg), last=True)
        textcat = nlp.get_pipe(component)
        for label in set(cats):
            textcat.add_label(label)
        nlp.begin_training(component_cfg={component: pipe_cfg})

        # Keep this run's scores for the single input doc.
        scores = textcat.model.predict([nlp.make_doc(text)])
        predictions.append(list(scores[0]))

    # The fixed seed must make every run agree with the first one.
    assert len(predictions) == n_runs
    baseline = predictions[0]
    assert baseline == predictions[1]
    assert baseline == predictions[2]
|  | @ -1,3 +1,4 @@ | |||
| import numpy | ||||
| from spacy.errors import AlignmentError | ||||
| from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags | ||||
| from spacy.gold import spans_from_biluo_tags, iob_to_biluo | ||||
|  | @ -5,6 +6,7 @@ from spacy.gold import Corpus, docs_to_json | |||
| from spacy.gold.example import Example | ||||
| from spacy.gold.converters import json2docs | ||||
| from spacy.lang.en import English | ||||
| from spacy.pipeline import EntityRuler | ||||
| from spacy.tokens import Doc, DocBin | ||||
| from spacy.util import get_words_and_spaces, minibatch | ||||
| from thinc.api import compounding | ||||
|  | @ -153,6 +155,27 @@ def test_gold_biluo_misalign(en_vocab): | |||
|     assert tags == ["O", "O", "O", "-", "-", "-"] | ||||
| 
 | ||||
| 
 | ||||
def test_example_constructor(en_vocab):
    """Build an Example from two Docs and read the reference tags back aligned."""
    words = ["I", "like", "stuff"]
    expected_tags = ["NOUN", "VERB", "NOUN"]
    # Tags are written to the reference doc as string-store IDs.
    tag_ids = [en_vocab.strings.add(tag) for tag in expected_tags]
    predicted = Doc(en_vocab, words=words)
    reference = Doc(en_vocab, words=words).from_array(
        "TAG", numpy.array(tag_ids, dtype="uint64")
    )
    example = Example(predicted, reference)
    assert example.get_aligned("TAG", as_string=True) == ["NOUN", "VERB", "NOUN"]
| 
 | ||||
| 
 | ||||
def test_example_from_dict_tags(en_vocab):
    """Tags passed to Example.from_dict under "TAGS" come back via get_aligned("TAG")."""
    predicted = Doc(en_vocab, words=["I", "like", "stuff"])
    gold_tags = ["NOUN", "VERB", "NOUN"]
    example = Example.from_dict(predicted, {"TAGS": gold_tags})
    aligned_tags = example.get_aligned("TAG", as_string=True)
    assert aligned_tags == ["NOUN", "VERB", "NOUN"]
| 
 | ||||
| 
 | ||||
| def test_example_from_dict_no_ner(en_vocab): | ||||
|     words = ["a", "b", "c", "d"] | ||||
|     spaces = [True, True, False, True] | ||||
|  | @ -272,72 +295,72 @@ def test_split_sentences(en_vocab): | |||
| 
 | ||||
| 
 | ||||
| def test_gold_biluo_one_to_many(en_vocab, en_tokenizer): | ||||
|     words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."] | ||||
|     words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."] | ||||
|     spaces = [True, True, True, False, False] | ||||
|     doc = Doc(en_vocab, words=words, spaces=spaces) | ||||
|     prefix = "Mr. and Mrs. Smith flew to " | ||||
|     prefix = "Mr and Mrs Smith flew to " | ||||
|     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] | ||||
|     gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."] | ||||
|     gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."] | ||||
|     example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) | ||||
|     ner_tags = example.get_aligned_ner() | ||||
|     assert ner_tags == ["O", "O", "O", "U-LOC", "O"] | ||||
| 
 | ||||
|     entities = [ | ||||
|         (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON | ||||
|         (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON | ||||
|         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), | ||||
|     ] | ||||
|     gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] | ||||
|     gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] | ||||
|     example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) | ||||
|     ner_tags = example.get_aligned_ner() | ||||
|     assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"] | ||||
| 
 | ||||
|     entities = [ | ||||
|         (len("Mr. and "), len("Mr. and Mrs."), "PERSON"),  # "Mrs." is a Person | ||||
|         (len("Mr and "), len("Mr and Mrs"), "PERSON"),  # "Mrs" is a Person | ||||
|         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), | ||||
|     ] | ||||
|     gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] | ||||
|     gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] | ||||
|     example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) | ||||
|     ner_tags = example.get_aligned_ner() | ||||
|     assert ner_tags == ["O", None, "O", "U-LOC", "O"] | ||||
| 
 | ||||
| 
 | ||||
| def test_gold_biluo_many_to_one(en_vocab, en_tokenizer): | ||||
|     words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] | ||||
|     words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] | ||||
|     spaces = [True, True, True, True, True, True, True, False, False] | ||||
|     doc = Doc(en_vocab, words=words, spaces=spaces) | ||||
|     prefix = "Mr. and Mrs. Smith flew to " | ||||
|     prefix = "Mr and Mrs Smith flew to " | ||||
|     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] | ||||
|     gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."] | ||||
|     gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."] | ||||
|     example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) | ||||
|     ner_tags = example.get_aligned_ner() | ||||
|     assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] | ||||
| 
 | ||||
|     entities = [ | ||||
|         (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON | ||||
|         (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON | ||||
|         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), | ||||
|     ] | ||||
|     gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."] | ||||
|     gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."] | ||||
|     example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) | ||||
|     ner_tags = example.get_aligned_ner() | ||||
|     assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] | ||||
| 
 | ||||
| 
 | ||||
| def test_gold_biluo_misaligned(en_vocab, en_tokenizer): | ||||
|     words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."] | ||||
|     words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."] | ||||
|     spaces = [True, True, True, True, True, False, False] | ||||
|     doc = Doc(en_vocab, words=words, spaces=spaces) | ||||
|     prefix = "Mr. and Mrs. Smith flew to " | ||||
|     prefix = "Mr and Mrs Smith flew to " | ||||
|     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] | ||||
|     gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."] | ||||
|     gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."] | ||||
|     example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) | ||||
|     ner_tags = example.get_aligned_ner() | ||||
|     assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] | ||||
| 
 | ||||
|     entities = [ | ||||
|         (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON | ||||
|         (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON | ||||
|         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), | ||||
|     ] | ||||
|     gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."] | ||||
|     gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."] | ||||
|     example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) | ||||
|     ner_tags = example.get_aligned_ner() | ||||
|     assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"] | ||||
|  | @ -407,6 +430,49 @@ def test_biluo_spans(en_tokenizer): | |||
|     assert spans[1].label_ == "GPE" | ||||
| 
 | ||||
| 
 | ||||
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
    """Entities on the finer-grained reference doc map onto the coarser predicted tokens."""

    def bounds(spans):
        return [(span.start, span.end) for span in spans]

    person = "Mr and Mrs Smith"
    location = "San Francisco Valley"
    prefix = "Mr and Mrs Smith flew to "
    # The predicted doc keeps each entity as a single token.
    doc = Doc(
        en_vocab,
        words=[person, "flew", "to", location, "."],
        spaces=[True, True, True, False, False],
    )
    entities = [
        (0, len(person), "PERSON"),
        (len(prefix), len(prefix + location), "LOC"),
    ]
    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})

    ents_ref = example.reference.ents
    assert bounds(ents_ref) == [(0, 4), (6, 9)]
    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
    assert bounds(ents_y2x) == [(0, 1), (3, 4)]
| 
 | ||||
| 
 | ||||
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
    """Entities found on the predicted doc map onto the coarser reference tokens."""

    def bounds(spans):
        return [(span.start, span.end) for span in spans]

    person = "Mr and Mrs Smith"
    location = "San Francisco Valley"
    text = "Mr and Mrs Smith flew to San Francisco Valley"

    # Predicted side: an entity ruler marks both entities on the tokenized text.
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [
            {"label": "PERSON", "pattern": person},
            {"label": "LOC", "pattern": location},
        ]
    )
    nlp.add_pipe(ruler)
    doc = nlp(text)
    assert bounds(doc.ents) == [(0, 4), (6, 9)]

    # Reference side: the same entities, given as character offsets over a
    # different tokenization.
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len(person), "PERSON"),
        (len(prefix), len(prefix + location), "LOC"),
    ]
    tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    assert bounds(example.reference.ents) == [(0, 2), (4, 6)]

    # 'get_aligned_spans_x2y' must project the predicted entities onto the
    # reference token boundaries.
    ents_pred = example.predicted.ents
    assert bounds(ents_pred) == [(0, 4), (6, 9)]
    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
    assert bounds(ents_x2y) == [(0, 2), (4, 6)]
| 
 | ||||
| 
 | ||||
| def test_gold_ner_missing_tags(en_tokenizer): | ||||
|     doc = en_tokenizer("I flew to Silicon Valley via London.") | ||||
|     biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] | ||||
|  | @ -414,6 +480,16 @@ def test_gold_ner_missing_tags(en_tokenizer): | |||
|     assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] | ||||
| 
 | ||||
| 
 | ||||
def test_projectivize(en_tokenizer):
    """get_aligned_parse changes the heads only when projectivize=True is passed."""
    example = Example.from_dict(
        en_tokenizer("He pretty quickly walks away"), {"heads": [3, 2, 3, 0, 2]}
    )
    # Only the heads are checked; the returned labels are ignored.
    proj_heads, _ = example.get_aligned_parse(projectivize=True)
    nonproj_heads, _ = example.get_aligned_parse(projectivize=False)
    # With projectivize=True the last token is expected to attach to token 3.
    assert proj_heads == [3, 2, 3, 0, 3]
    assert nonproj_heads == [3, 2, 3, 0, 2]
| 
 | ||||
| 
 | ||||
| def test_iob_to_biluo(): | ||||
|     good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] | ||||
|     good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] | ||||
|  |  | |||
							
								
								
									
										156
									
								
								spacy/tests/test_models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										156
									
								
								spacy/tests/test_models.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,156 @@ | |||
| from typing import List | ||||
| 
 | ||||
| import pytest | ||||
| from thinc.api import fix_random_seed, Adam, set_dropout_rate | ||||
| from numpy.testing import assert_array_equal | ||||
| import numpy | ||||
| 
 | ||||
| from spacy.ml.models import build_Tok2Vec_model | ||||
| from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier | ||||
| from spacy.lang.en import English | ||||
| from spacy.lang.en.examples import sentences as EN_SENTENCES | ||||
| 
 | ||||
| 
 | ||||
def get_all_params(model):
    """Flatten every parameter of `model` and its sub-layers into one 1-d array.

    model: An object exposing `walk()`, whose nodes provide `param_names` and
        `get_param(name)`, and an `ops.xp` array module.
    RETURNS: The concatenation of all raveled parameters, in walk order.
    """
    params = [
        node.get_param(name).ravel()
        for node in model.walk()
        for name in node.param_names
    ]
    # Take the array module from the model itself instead of relying on the
    # loop variable still being bound after the walk has finished.
    return model.ops.xp.concatenate(params)
| 
 | ||||
| 
 | ||||
def get_docs():
    """Return Docs for the English example sentences plus one Doc joining them all."""
    texts = EN_SENTENCES + [" ".join(EN_SENTENCES)]
    nlp = English()
    docs = nlp.pipe(texts)
    return list(docs)
| 
 | ||||
| 
 | ||||
def get_gradient(model, Y):
    """Create a random gradient shaped like the model output `Y`.

    model: The model whose `ops` are used to allocate and fill the arrays.
    Y: Either an array (of the model's array type) or a list of such outputs.
    RETURNS: An array of uniform noise in [-1, 1) with Y's shape and dtype,
        or a list of such gradients if Y is a list.
    RAISES: ValueError for any other output type.
    """
    xp = model.ops.xp
    if isinstance(Y, xp.ndarray):
        # Allocate with Y's dtype, then add the noise in place so the
        # result keeps that dtype.
        dY = model.ops.alloc(Y.shape, dtype=Y.dtype)
        dY += xp.random.uniform(-1.0, 1.0, Y.shape)
        return dY
    if isinstance(Y, list):
        return [get_gradient(model, y) for y in Y]
    raise ValueError(f"Could not get gradient for type {type(Y)}")
| 
 | ||||
| 
 | ||||
def default_tok2vec():
    """Build a Tok2Vec model from the module-level TOK2VEC_KWARGS settings."""
    tok2vec = build_Tok2Vec_model(**TOK2VEC_KWARGS)
    return tok2vec
| 
 | ||||
| 
 | ||||
| TOK2VEC_KWARGS = { | ||||
|     "width": 96, | ||||
|     "embed_size": 2000, | ||||
|     "subword_features": True, | ||||
|     "char_embed": False, | ||||
|     "conv_depth": 4, | ||||
|     "bilstm_depth": 0, | ||||
|     "maxout_pieces": 4, | ||||
|     "window_size": 1, | ||||
|     "dropout": 0.1, | ||||
|     "nM": 0, | ||||
|     "nC": 0, | ||||
|     "pretrained_vectors": None, | ||||
| } | ||||
| 
 | ||||
| TEXTCAT_KWARGS = { | ||||
|     "width": 64, | ||||
|     "embed_size": 2000, | ||||
|     "pretrained_vectors": None, | ||||
|     "exclusive_classes": False, | ||||
|     "ngram_size": 1, | ||||
|     "window_size": 1, | ||||
|     "conv_depth": 2, | ||||
|     "dropout": None, | ||||
|     "nO": 7 | ||||
| } | ||||
| 
 | ||||
| TEXTCAT_CNN_KWARGS = { | ||||
|     "tok2vec": default_tok2vec(), | ||||
|     "exclusive_classes": False, | ||||
|     "nO": 13, | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
    "seed,model_func,kwargs",
    [
        (0, build_Tok2Vec_model, TOK2VEC_KWARGS),
        (0, build_text_classifier, TEXTCAT_KWARGS),
        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS),
    ],
)
def test_models_initialize_consistently(seed, model_func, kwargs):
    """Two models built and initialized from the same seed have equal parameters.

    NOTE(review): TEXTCAT_CNN_KWARGS carries one pre-built tok2vec instance, so
    in that case both models appear to receive the same tok2vec object rather
    than two independently initialized ones -- confirm this is intended.
    """

    def build_initialized():
        fix_random_seed(seed)
        model = model_func(**kwargs)
        model.initialize()
        return model

    first = build_initialized()
    second = build_initialized()
    assert_array_equal(get_all_params(first), get_all_params(second))
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
    "seed,model_func,kwargs,get_X",
    [
        (0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
        (0, build_text_classifier, TEXTCAT_KWARGS, get_docs),
        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
    ],
)
def test_models_predict_consistently(seed, model_func, kwargs, get_X):
    """Two models built from the same seed produce identical predictions."""
    fix_random_seed(seed)
    model1 = model_func(**kwargs).initialize()
    Y1 = model1.predict(get_X())
    fix_random_seed(seed)
    model2 = model_func(**kwargs).initialize()
    Y2 = model2.predict(get_X())

    # If the model exposes a tok2vec sub-layer, its outputs must match too.
    if model1.has_ref("tok2vec"):
        vectors1 = model1.get_ref("tok2vec").predict(get_X())
        vectors2 = model2.get_ref("tok2vec").predict(get_X())
        for i, doc_vectors in enumerate(vectors1):
            for j, vector in enumerate(doc_vectors):
                assert_array_equal(
                    numpy.asarray(vector), numpy.asarray(vectors2[i][j])
                )

    if isinstance(Y1, numpy.ndarray):
        assert_array_equal(Y1, Y2)
        return
    if not isinstance(Y1, list):
        raise ValueError(f"Could not compare type {type(Y1)}")
    # List outputs are compared element by element.
    assert len(Y1) == len(Y2)
    for y1, y2 in zip(Y1, Y2):
        assert_array_equal(y1, y2)
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
    "seed,dropout,model_func,kwargs,get_X",
    [
        (0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
        (0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs),
        (0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
    ],
)
def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
    """Two identically seeded training runs end with the same parameters."""

    def train_from_scratch():
        fix_random_seed(seed)
        optimizer = Adam(0.001)
        model = model_func(**kwargs).initialize()
        params_before = get_all_params(model)
        set_dropout_rate(model, dropout)
        for _ in range(5):
            Y, backprop = model.begin_update(get_X())
            backprop(get_gradient(model, Y))
            model.finish_update(optimizer)
        params_after = get_all_params(model)
        # The updates must actually have changed the weights, so the
        # equality check is expected to fail here.
        with pytest.raises(AssertionError):
            assert_array_equal(params_before, params_after)
        return model

    first = train_from_scratch()
    second = train_from_scratch()
    assert_array_equal(get_all_params(first), get_all_params(second))
							
								
								
									
										31
									
								
								spacy/tests/test_projects.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								spacy/tests/test_projects.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,31 @@ | |||
| import pytest | ||||
| from spacy.cli.project.util import validate_project_commands | ||||
| from spacy.schemas import ProjectConfigSchema, validate | ||||
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
    "config",
    [
        # Two commands with the same name
        {"commands": [{"name": "a"}, {"name": "a"}]},
        # Workflow "a" has the same name as command "a"
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        # Workflow "b" lists step "c", but only command "a" is defined
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    """Each of these configs makes validate_project_commands exit."""
    with pytest.raises(SystemExit):
        validate_project_commands(config)
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
    "config,n_errors",
    [
        # "commands" is a dict instead of a list
        ({"commands": {"a": []}}, 1),
        # Command without a "name"
        ({"commands": [{"help": "..."}]}, 1),
        # Command with an additional "extra" key
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        # Command with an "extra" key and no "name"
        ({"commands": [{"extra": "b"}]}, 2),
        # "deps" contains an integer
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    """Schema validation reports the expected number of errors per config."""
    assert len(validate(ProjectConfigSchema, config)) == n_errors
|  | @ -803,7 +803,7 @@ cdef class Doc: | |||
|         attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) | ||||
|                  for id_ in attrs] | ||||
|         if array.dtype != numpy.uint64: | ||||
|             warnings.warn(Warnings.W101.format(type=array.dtype)) | ||||
|             warnings.warn(Warnings.W028.format(type=array.dtype)) | ||||
| 
 | ||||
|         if SENT_START in attrs and HEAD in attrs: | ||||
|             raise ValueError(Errors.E032) | ||||
|  |  | |||
|  | @ -20,7 +20,6 @@ import subprocess | |||
| from contextlib import contextmanager | ||||
| import tempfile | ||||
| import shutil | ||||
| import hashlib | ||||
| import shlex | ||||
| 
 | ||||
| try: | ||||
|  | @ -449,6 +448,16 @@ def split_command(command: str) -> List[str]: | |||
|     return shlex.split(command, posix=not is_windows) | ||||
| 
 | ||||
| 
 | ||||
def join_command(command: List[str]) -> str:
    """Turn a list of command parts into a single shell-quoted string.

    Stands in for shlex.join, which only exists on Python 3.8+: every part is
    quoted with shlex.quote and the parts are joined with single spaces.

    command (List[str]): The command to join.
    RETURNS (str): The joined command
    """
    quoted_parts = [shlex.quote(part) for part in command]
    return " ".join(quoted_parts)
| 
 | ||||
| 
 | ||||
| def run_command(command: Union[str, List[str]]) -> None: | ||||
|     """Run a command on the command line as a subprocess. If the subprocess | ||||
|     returns a non-zero exit code, a system exit is performed. | ||||
|  | @ -501,23 +510,13 @@ def make_tempdir(): | |||
|         warnings.warn(Warnings.W091.format(dir=d, msg=e)) | ||||
| 
 | ||||
| 
 | ||||
| def get_hash(data) -> str: | ||||
|     """Get the hash for a JSON-serializable object. | ||||
| def is_cwd(path: Union[Path, str]) -> bool: | ||||
|     """Check whether a path is the current working directory. | ||||
| 
 | ||||
|     data: The data to hash. | ||||
|     RETURNS (str): The hash. | ||||
|     path (Union[Path, str]): The directory path. | ||||
|     RETURNS (bool): Whether the path is the current working directory. | ||||
|     """ | ||||
|     data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") | ||||
|     return hashlib.md5(data_str).hexdigest() | ||||
| 
 | ||||
| 
 | ||||
| def get_checksum(path: Union[Path, str]) -> str: | ||||
|     """Get the checksum for a file given its file path. | ||||
| 
 | ||||
|     path (Union[Path, str]): The file path. | ||||
|     RETURNS (str): The checksum. | ||||
|     """ | ||||
|     return hashlib.md5(Path(path).read_bytes()).hexdigest() | ||||
|     return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower() | ||||
| 
 | ||||
| 
 | ||||
| def is_in_jupyter(): | ||||
|  | @ -722,6 +721,51 @@ def minibatch(items, size=8): | |||
|         yield list(batch) | ||||
| 
 | ||||
| 
 | ||||
def minibatch_by_padded_size(docs, size, buffer=256, discard_oversize=False):
    """Yield minibatches of similarly sized docs, limited by padded size.

    The input is read in chunks of `buffer` docs. Each chunk is split by
    `_batch_by_length` using the next target size, and the resulting
    sub-batches are yielded in the order that helper returns them.

    docs: The sequences to batch; each must support len().
    size: The target padded size (longest sequence * number of sequences),
        either as a constant int or as an iterator yielding one target per chunk.
    buffer (int): Number of docs taken from the input per chunk.
    discard_oversize (bool): If True, drop sub-batches whose padded size is
        greater than or equal to the target size.
    YIELDS (list): The sub-batches.
    """
    sizes = itertools.repeat(size) if isinstance(size, int) else size
    for chunk in minibatch(docs, buffer):
        chunk = list(chunk)
        target_size = next(sizes)
        for indices in _batch_by_length(chunk, target_size):
            subbatch = [chunk[i] for i in indices]
            padded_size = len(subbatch) * max(len(seq) for seq in subbatch)
            if discard_oversize and padded_size >= target_size:
                continue
            yield subbatch
| 
 | ||||
| 
 | ||||
| def _batch_by_length(seqs, max_words): | ||||
|     """Given a list of sequences, return a batched list of indices into the | ||||
|     list, where the batches are grouped by length, in descending order. | ||||
| 
 | ||||
|     Batches may be at most max_words in size, defined as max sequence length * size. | ||||
|     """ | ||||
|     # Use negative index so we can get sort by position ascending. | ||||
|     lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)] | ||||
|     lengths_indices.sort() | ||||
|     batches = [] | ||||
|     batch = [] | ||||
|     for length, i in lengths_indices: | ||||
|         if not batch: | ||||
|             batch.append(i) | ||||
|         elif length * (len(batch) + 1) <= max_words: | ||||
|             batch.append(i) | ||||
|         else: | ||||
|             batches.append(batch) | ||||
|             batch = [i] | ||||
|     if batch: | ||||
|         batches.append(batch) | ||||
|     # Check lengths match | ||||
|     assert sum(len(b) for b in batches) == len(seqs) | ||||
|     batches = [list(sorted(batch)) for batch in batches] | ||||
|     batches.reverse() | ||||
|     return batches | ||||
| 
 | ||||
| 
 | ||||
| def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): | ||||
|     """Create minibatches of roughly a given number of words. If any examples | ||||
|     are longer than the specified batch length, they will appear in a batch by | ||||
|  | @ -768,7 +812,8 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): | |||
| 
 | ||||
|         # yield the previous batch and start a new one. The new one gets the overflow examples. | ||||
|         else: | ||||
|             yield batch | ||||
|             if batch: | ||||
|                 yield batch | ||||
|             target_size = next(size_) | ||||
|             tol_size = target_size * tolerance | ||||
|             batch = overflow | ||||
|  | @ -788,15 +833,15 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): | |||
| 
 | ||||
|             # this example does not fit with the previous overflow: start another new batch | ||||
|             else: | ||||
|                 yield batch | ||||
|                 if batch: | ||||
|                     yield batch | ||||
|                 target_size = next(size_) | ||||
|                 tol_size = target_size * tolerance | ||||
|                 batch = [doc] | ||||
|                 batch_size = n_words | ||||
| 
 | ||||
|     # yield the final batch | ||||
|     batch.extend(overflow) | ||||
|     if batch: | ||||
|         batch.extend(overflow) | ||||
|         yield batch | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,4 +4,34 @@ teaser: Pre-defined model architectures included with the core library | |||
| source: spacy/ml/models | ||||
| --- | ||||
| 
 | ||||
| TODO: write | ||||
| TODO: intro and how architectures work, link to | ||||
| [`registry`](/api/top-level#registry), | ||||
| [custom models](/usage/training#custom-models) usage etc. | ||||
| 
 | ||||
| ## Parser architectures {source="spacy/ml/models/parser.py"} | ||||
| 
 | ||||
| ### spacy.TransitionBasedParser.v1 | ||||
| 
 | ||||
| <!-- TODO: intro --> | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TransitionBasedParser.v1" | ||||
| > nr_feature_tokens = 6 | ||||
| > hidden_width = 64 | ||||
| > maxout_pieces = 2 | ||||
| > | ||||
| > [model.tok2vec] | ||||
| > # ... | ||||
| > ``` | ||||
| 
 | ||||
| | Name                | Type                                       | Description | | ||||
| | ------------------- | ------------------------------------------ | ----------- | | ||||
| | `tok2vec`           | [`Model`](https://thinc.ai/docs/api-model) |             | | ||||
| | `nr_feature_tokens` | int                                        |             | | ||||
| | `hidden_width`      | int                                        |             | | ||||
| | `maxout_pieces`     | int                                        |             | | ||||
| | `use_upper`         | bool                                       |             | | ||||
| | `nO`                | int                                        |             | | ||||
|  |  | |||
|  | @ -297,60 +297,41 @@ will not be available. | |||
| 
 | ||||
| ## Train {#train} | ||||
| 
 | ||||
| <!-- TODO: document new training --> | ||||
| 
 | ||||
| Train a model. Expects data in spaCy's | ||||
| [JSON format](/api/data-formats#json-input). On each epoch, a model will be | ||||
| saved out to the directory. Accuracy scores and model details will be added to a | ||||
| [`meta.json`](/usage/training#models-generating) to allow packaging the model | ||||
| using the [`package`](/api/cli#package) command. | ||||
| [binary format](/api/data-formats#training) and a | ||||
| [config file](/api/data-formats#config) with all settings and hyperparameters. | ||||
| Will save out the best model from all epochs, as well as the final model. The | ||||
| `--code` argument can be used to provide a Python file that's imported before | ||||
| the training process starts. This lets you register | ||||
| [custom functions](/usage/training#custom-models) and architectures and refer to | ||||
| them in your config, all while still using spaCy's built-in `train` workflow. If | ||||
| you need to manage complex multi-step training workflows, check out the new | ||||
| [spaCy projects](/usage/projects). | ||||
| 
 | ||||
| <Infobox title="New in v3.0" variant="warning"> | ||||
| 
 | ||||
| As of spaCy v3.0, the `train` command doesn't take a long list of command-line | ||||
| arguments anymore and instead expects a single | ||||
| [`config.cfg` file](/usage/training#config) containing all settings for the | ||||
| pipeline, training process and hyperparameters. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | ||||
| [--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping] | ||||
| [--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec] | ||||
| [--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level] | ||||
| [--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel] | ||||
| [--textcat-positive-label] [--verbose] | ||||
| $ python -m spacy train [train_path] [dev_path] [config_path] [--output] | ||||
| [--code] [--verbose] | ||||
| ``` | ||||
| 
 | ||||
| | Argument                                                        | Type          | Description                                                                                                                                                       | | ||||
| | --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `lang`                                                          | positional    | Model language.                                                                                                                                                   | | ||||
| | `output_path`                                                   | positional    | Directory to store model in. Will be created if it doesn't exist.                                                                                                 | | ||||
| | `train_path`                                                    | positional    | Location of JSON-formatted training data. Can be a file or a directory of files.                                                                                  | | ||||
| | `dev_path`                                                      | positional    | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files.                                                                | | ||||
| | `--base-model`, `-b` <Tag variant="new">2.1</Tag>               | option        | Optional name of base model to update. Can be any loadable spaCy model.                                                                                           | | ||||
| | `--pipeline`, `-p` <Tag variant="new">2.1</Tag>                 | option        | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`.                                                                         | | ||||
| | `--replace-components`, `-R`                                    | flag          | Replace components from the base model.                                                                                                                           | | ||||
| | `--vectors`, `-v`                                               | option        | Model to load vectors from.                                                                                                                                       | | ||||
| | `--n-iter`, `-n`                                                | option        | Number of iterations (default: `30`).                                                                                                                             | | ||||
| | `--n-early-stopping`, `-ne`                                     | option        | Maximum number of training epochs without dev accuracy improvement.                                                                                               | | ||||
| | `--n-examples`, `-ns`                                           | option        | Number of examples to use (defaults to `0` for all examples).                                                                                                     | | ||||
| | `--use-gpu`, `-g`                                               | option        | GPU ID or `-1` for CPU only (default: `-1`).                                                                                                                      | | ||||
| | `--version`, `-V`                                               | option        | Model version. Will be written out to the model's `meta.json` after training.                                                                                     | | ||||
| | `--meta-path`, `-m` <Tag variant="new">2</Tag>                  | option        | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. | | ||||
| | `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag>           | option        | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.                                                       | | ||||
| | `--parser-multitasks`, `-pt`                                    | option        | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'`                                                                                                       | | ||||
| | `--entity-multitasks`, `-et`                                    | option        | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'`                                                                                                          | | ||||
| | `--width`, `-cw` <Tag variant="new">2.2.4</Tag>                 | option        | Width of CNN layers of `Tok2Vec` component.                                                                                                                       | | ||||
| | `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag>            | option        | Depth of CNN layers of `Tok2Vec` component.                                                                                                                       | | ||||
| | `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag>            | option        | Window size for CNN layers of `Tok2Vec` component.                                                                                                                | | ||||
| | `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag>            | option        | Maxout size for CNN layers of `Tok2Vec` component.                                                                                                                | | ||||
| | `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag>            | flag          | Whether to use character-based embedding of `Tok2Vec` component.                                                                                                  | | ||||
| | `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag>        | option        | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch).                                                                                                 | | ||||
| | `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag>            | option        | Number of embedding rows of `Tok2Vec` component.                                                                                                                  | | ||||
| | `--noise-level`, `-nl`                                          | option        | Float indicating the amount of corruption for data augmentation.                                                                                                  | | ||||
| | `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag>     | option        | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement).                | | ||||
| | `--gold-preproc`, `-G`                                          | flag          | Use gold preprocessing.                                                                                                                                           | | ||||
| | `--learn-tokens`, `-T`                                          | flag          | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese.                                                     | | ||||
| | `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag>     | flag          | Text classification classes aren't mutually exclusive (multilabel).                                                                                               | | ||||
| | `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag>            | option        | Text classification model architecture. Defaults to `"bow"`.                                                                                                      | | ||||
| | `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option        | Text classification positive label for binary classes with two labels.                                                                                            | | ||||
| | `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag>          | option        | Location of JSON-formatted tag map.                                                                                                                               | | ||||
| | `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag>              | flag          | Show more detailed messages during training.                                                                                                                      | | ||||
| | `--help`, `-h`                                                  | flag          | Show help message and available arguments.                                                                                                                        | | ||||
| | **CREATES**                                                     | model, pickle | A spaCy model on each epoch.                                                                                                                                      | | ||||
| | Argument          | Type       | Description                                                                                                                                          | | ||||
| | ----------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `train_path`      | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files.                             | | ||||
| | `dev_path`        | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files.           | | ||||
| | `config_path`     | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters.                                                | | ||||
| | `--output`, `-o`  | option     | Directory to store model in. Will be created if it doesn't exist.                                                                                    | | ||||
| | `--code`, `-c`    | option     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | | ||||
| | `--verbose`, `-V` | flag       | Show more detailed messages during training.                                                                                                         | | ||||
| | `--help`, `-h`    | flag       | Show help message and available arguments.                                                                                                           | | ||||
| | **CREATES**       | model      | The final model and the best model.                                                                                                                  | | ||||
| 
 | ||||
| ## Pretrain {#pretrain new="2.1" tag="experimental"} | ||||
| 
 | ||||
|  | @ -471,20 +452,20 @@ as separate files if the respective component is present in the model's | |||
| pipeline. | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] | ||||
| [--gpu-id] [--gold-preproc] [--return-scores] | ||||
| $ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path] | ||||
| [--displacy-limit] [--gpu-id] [--gold-preproc] | ||||
| ``` | ||||
| 
 | ||||
| | Argument                  | Type           | Description                                                                                                                                              | | ||||
| | ------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `model`                   | positional     | Model to evaluate. Can be a package or a path to a model data directory.                                                                                 | | ||||
| | `data_path`               | positional     | Location of JSON-formatted evaluation data.                                                                                                              | | ||||
| | `--displacy-path`, `-dp`  | option         | Directory to output rendered parses as HTML. If not set, no visualizations will be generated.                                                            | | ||||
| | `--displacy-limit`, `-dl` | option         | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. | | ||||
| | `--gpu-id`, `-g`          | option         | GPU to use, if any. Defaults to `-1` for CPU.                                                                                                            | | ||||
| | `--gold-preproc`, `-G`    | flag           | Use gold preprocessing.                                                                                                                                  | | ||||
| | `--return-scores`, `-R`   | flag           | Return dict containing model scores.                                                                                                                     | | ||||
| | **CREATES**               | `stdout`, HTML | Training results and optional displaCy visualizations.                                                                                                   | | ||||
| | Argument                  | Type                 | Description                                                                                                                                              | | ||||
| | ------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `model`                   | positional           | Model to evaluate. Can be a package or a path to a model data directory.                                                                                 | | ||||
| | `data_path`               | positional           | Location of evaluation data in spaCy's [binary format](/api/data-formats#training).                                                                      | | ||||
| | `--output`, `-o`          | option               | Output JSON file for metrics. If not set, no metrics will be exported.                                                                                   | | ||||
| | `--displacy-path`, `-dp`  | option               | Directory to output rendered parses as HTML. If not set, no visualizations will be generated.                                                            | | ||||
| | `--displacy-limit`, `-dl` | option               | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. | | ||||
| | `--gpu-id`, `-g`          | option               | GPU to use, if any. Defaults to `-1` for CPU.                                                                                                            | | ||||
| | `--gold-preproc`, `-G`    | flag                 | Use gold preprocessing.                                                                                                                                  | | ||||
| | **CREATES**               | `stdout`, JSON, HTML | Evaluation results and optional metrics and visualizations.                                              | | ||||
| 
 | ||||
| ## Package {#package} | ||||
| 
 | ||||
|  | @ -504,15 +485,17 @@ so you don't have to run `python setup.py sdist` separately anymore. | |||
| </Infobox> | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] | ||||
| $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] | ||||
| [--version] [--force] | ||||
| ``` | ||||
| 
 | ||||
| ```bash | ||||
| ### Example | ||||
| python -m spacy package /input /output | ||||
| cd /output/en_model-0.0.0 | ||||
| pip install dist/en_model-0.0.0.tar.gz | ||||
| ``` | ||||
| > #### Example | ||||
| > | ||||
| > ```bash | ||||
| > python -m spacy package /input /output | ||||
| > cd /output/en_model-0.0.0 | ||||
| > pip install dist/en_model-0.0.0.tar.gz | ||||
| > ``` | ||||
| 
 | ||||
| | Argument                                         | Type       | Description                                                                                                                                                                                     | | ||||
| | ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
|  | @ -525,18 +508,137 @@ pip install dist/en_model-0.0.0.tar.gz | |||
| | `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                                                                                                                      | | ||||
| | **CREATES**                                      | directory  | A Python package containing the spaCy model.                                                                                                                                                    | | ||||
| 
 | ||||
| ## Project {#project} | ||||
| ## Project {#project new="3"} | ||||
| 
 | ||||
| <!-- TODO: document project command and subcommands. We should probably wait and only finalize this once we've finalized the design --> | ||||
| The `spacy project` CLI includes subcommands for working with | ||||
| [spaCy projects](/usage/projects), end-to-end workflows for building and | ||||
| deploying custom spaCy models. | ||||
| 
 | ||||
| ### project clone {#project-clone} | ||||
| 
 | ||||
| Clone a project template from a Git repository. Calls into `git` under the hood | ||||
| and uses the sparse checkout feature, so you're only downloading what you need. | ||||
| By default, spaCy's | ||||
| [project templates repo](https://github.com/explosion/projects) is used, but you | ||||
| can provide any other repo (public or private) that you have access to using the | ||||
| `--repo` option. | ||||
| 
 | ||||
| <!-- TODO: update example once we've decided on repo structure --> | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy project clone [name] [dest] [--repo] | ||||
| ``` | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```bash | ||||
| > $ python -m spacy project clone some_example | ||||
| > ``` | ||||
| > | ||||
| > Clone from custom repo: | ||||
| > | ||||
| > ```bash | ||||
| > $ python -m spacy project clone template --repo https://github.com/your_org/your_repo | ||||
| > ``` | ||||
| 
 | ||||
| | Argument       | Type       | Description                                                                                                                  | | ||||
| | -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `name`         | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. | | ||||
| | `dest`         | positional | Where to clone the project. Defaults to current working directory.                                                           | | ||||
| | `--repo`, `-r` | option     | The repository to clone from. Can be any public or private Git repo you have access to.                                      | | ||||
| | `--help`, `-h` | flag       | Show help message and available arguments.                                                                                   | | ||||
| | **CREATES**    | directory  | The cloned [project directory](/usage/projects#project-files).                                                               | | ||||
| 
 | ||||
| ### project assets {#project-assets} | ||||
| 
 | ||||
| ### project run-all {#project-run-all} | ||||
| Fetch project assets like datasets and pretrained weights. Assets are defined in | ||||
| the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a | ||||
| `checksum` is provided, the file is only downloaded if no local file with the | ||||
| same checksum exists and spaCy will show an error if the checksum of the | ||||
| downloaded file doesn't match. If assets don't specify a `url` they're | ||||
| considered "private" and you have to take care of putting them into the | ||||
| destination directory yourself. If a local path is provided, the asset is copied | ||||
| into the current project. | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy project assets [project_dir] | ||||
| ``` | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```bash | ||||
| > $ python -m spacy project assets | ||||
| > ``` | ||||
| 
 | ||||
| | Argument       | Type       | Description                                                       | | ||||
| | -------------- | ---------- | ----------------------------------------------------------------- | | ||||
| | `project_dir`  | positional | Path to project directory. Defaults to current working directory. | | ||||
| | `--help`, `-h` | flag       | Show help message and available arguments.                        | | ||||
| | **CREATES**    | files      | Downloaded or copied assets defined in the `project.yml`.         | | ||||
| 
 | ||||
| ### project run {#project-run} | ||||
| 
 | ||||
| ### project init {#project-init} | ||||
| Run a named command or workflow defined in the | ||||
| [`project.yml`](/usage/projects#project-yml). If a workflow name is specified, | ||||
| all commands in the workflow are run, in order. If commands define | ||||
| [dependencies or outputs](/usage/projects#deps-outputs), they will only be | ||||
| re-run if state has changed. For example, if the input dataset changes, a | ||||
| preprocessing command that depends on those files will be re-run. | ||||
| 
 | ||||
| ### project update-dvc {#project-update-dvc} | ||||
| ```bash | ||||
| $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry] | ||||
| ``` | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```bash | ||||
| > $ python -m spacy project run train | ||||
| > ``` | ||||
| 
 | ||||
| | Argument        | Type       | Description                                                       | | ||||
| | --------------- | ---------- | ----------------------------------------------------------------- | | ||||
| | `subcommand`    | positional | Name of the command or workflow to run.                           | | ||||
| | `project_dir`   | positional | Path to project directory. Defaults to current working directory. | | ||||
| | `--force`, `-F` | flag       | Force re-running steps, even if nothing changed.                  | | ||||
| | `--dry`, `-D`   | flag       | Perform a dry run and don't execute scripts.                      | | ||||
| | `--help`, `-h`  | flag       | Show help message and available arguments.                        | | ||||
| 
 | ||||
| ### project dvc {#project-dvc} | ||||
| 
 | ||||
| Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls | ||||
| [`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under | ||||
| the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline, | ||||
| so you need to specify one workflow defined in the | ||||
| [`project.yml`](/usage/projects#project-yml). If no workflow is specified, the | ||||
| first defined workflow is used. The DVC config will only be updated if the | ||||
| `project.yml` changed. For details, see the | ||||
| [DVC integration](/usage/projects#dvc) docs. | ||||
| 
 | ||||
| <Infobox variant="warning"> | ||||
| 
 | ||||
| This command requires DVC to be installed and initialized in the project | ||||
| directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init). | ||||
| You'll also need to add the assets you want to track with | ||||
| [`dvc add`](https://dvc.org/doc/command-reference/add). | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] | ||||
| ``` | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```bash | ||||
| > git init | ||||
| > dvc init | ||||
| > python -m spacy project dvc all | ||||
| > ``` | ||||
| 
 | ||||
| | Argument          | Type       | Description                                                                       | | ||||
| | ----------------- | ---------- | --------------------------------------------------------------------------------- | | ||||
| | `project_dir`     | positional | Path to project directory. Defaults to current working directory.                 | | ||||
| | `workflow`        | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. | | ||||
| | `--force`, `-F`   | flag       | Force-updating config file.                                                       | | ||||
| | `--verbose`, `-V` | flag       | Print more output generated by DVC.                                               | | ||||
| | `--help`, `-h`    | flag       | Show help message and available arguments.                                        | | ||||
|  |  | |||
|  | @ -122,7 +122,7 @@ where the rescuers keep passing out from low oxygen, causing another rescuer to | |||
| follow — only to succumb themselves. In short, just say no to optimizing your | ||||
| Python. If it's not fast enough the first time, just switch to Cython. | ||||
| 
 | ||||
| <Infobox title="📖 Resources"> | ||||
| <Infobox title="Resources" emoji="📖"> | ||||
| 
 | ||||
| - [Official Cython documentation](http://docs.cython.org/en/latest/) | ||||
|   (cython.org) | ||||
|  |  | |||
|  | @ -2,7 +2,8 @@ | |||
| title: Data formats | ||||
| teaser: Details on spaCy's input and output data formats | ||||
| menu: | ||||
|   - ['Training data', 'training'] | ||||
|   - ['Training Data', 'training'] | ||||
|   - ['Training Config', 'config'] | ||||
|   - ['Vocabulary', 'vocab'] | ||||
| --- | ||||
| 
 | ||||
|  | @ -74,6 +75,29 @@ from the English Wall Street Journal portion of the Penn Treebank: | |||
| https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json | ||||
| ``` | ||||
| 
 | ||||
| ## Training config {#config new="3"} | ||||
| 
 | ||||
| Config files define the training process and model pipeline and can be passed to | ||||
| [`spacy train`](/api/cli#train). They use | ||||
| [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the | ||||
| hood. For details on how to use training configs, see the | ||||
| [usage documentation](/usage/training#config). | ||||
| 
 | ||||
| <Infobox variant="warning"> | ||||
| 
 | ||||
| The `@` syntax lets you refer to function names registered in the | ||||
| [function registry](/api/top-level#registry). For example, | ||||
| `@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of | ||||
| the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block | ||||
| will be passed into that function as arguments. Those arguments depend on the | ||||
| registered function. See the [model architectures](/api/architectures) docs for | ||||
| API details. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| <!-- TODO: we need to come up with a good way to present the sections and their expected values visually? --> | ||||
| <!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command --> | ||||
| 
 | ||||
| ## Lexical data for vocabulary {#vocab-jsonl new="2"} | ||||
| 
 | ||||
| To populate a model's vocabulary, you can use the | ||||
|  |  | |||
|  | @ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline | |||
| component is available in the [processing pipeline](/usage/processing-pipelines) | ||||
| via the ID `"parser"`. | ||||
| 
 | ||||
| ## DependencyParser.Model {#model tag="classmethod"} | ||||
| ## Default config {#config} | ||||
| 
 | ||||
| Initialize a model for the pipe. The model should implement the | ||||
| `thinc.neural.Model` API. Wrappers are under development for most major machine | ||||
| learning libraries. | ||||
| This is the default configuration used to initialize the model powering the | ||||
| pipeline component. See the [model architectures](/api/architectures) | ||||
| documentation for details on the architectures and their arguments and | ||||
| hyperparameters. To learn more about how to customize the config and train | ||||
| custom models, check out the [training config](/usage/training#config) docs. | ||||
| 
 | ||||
| | Name        | Type   | Description                           | | ||||
| | ----------- | ------ | ------------------------------------- | | ||||
| | `**kwargs`  | -      | Parameters for initializing the model | | ||||
| | **RETURNS** | object | The initialized model.                | | ||||
| ```python | ||||
| https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_defaults.cfg | ||||
| ``` | ||||
| 
 | ||||
| ## DependencyParser.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Construction via create_pipe with default model | ||||
| > parser = nlp.create_pipe("parser") | ||||
| > | ||||
| > # Construction via create_pipe with custom model | ||||
| > config = {"model": {"@architectures": "my_parser"}} | ||||
| > parser = nlp.create_pipe("parser", config) | ||||
| > | ||||
| > # Construction from class with custom model from file | ||||
| > from spacy.pipeline import DependencyParser | ||||
| > model = util.load_config("model.cfg", create_objects=True)["model"] | ||||
| > parser = DependencyParser(nlp.vocab, model) | ||||
| > ``` | ||||
| 
 | ||||
| Create a new pipeline instance. In your application, you would normally use a | ||||
| shortcut for this and instantiate the component using its string name and | ||||
| [`nlp.create_pipe`](/api/language#create_pipe). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Construction via create_pipe | ||||
| > parser = nlp.create_pipe("parser") | ||||
| > | ||||
| > # Construction from class | ||||
| > from spacy.pipeline import DependencyParser | ||||
| > parser = DependencyParser(nlp.vocab) | ||||
| > parser.from_disk("/path/to/model") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type                          | Description                                                                                                                                           | | ||||
| | ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `vocab`     | `Vocab`                       | The shared vocabulary.                                                                                                                                | | ||||
| | `model`     | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | | ||||
| | `**cfg`     | -                             | Configuration parameters.                                                                                                                             | | ||||
| | **RETURNS** | `DependencyParser`            | The newly constructed object.                                                                                                                         | | ||||
| | Name        | Type               | Description                                                                     | | ||||
| | ----------- | ------------------ | ------------------------------------------------------------------------------- | | ||||
| | `vocab`     | `Vocab`            | The shared vocabulary.                                                          | | ||||
| | `model`     | `Model`            | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | ||||
| | `**cfg`     | -                  | Configuration parameters.                                                       | | ||||
| | **RETURNS** | `DependencyParser` | The newly constructed object.                                                   | | ||||
| 
 | ||||
| ## DependencyParser.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
|  | @ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and | |||
| >     pass | ||||
| > ``` | ||||
| 
 | ||||
| | Name         | Type     | Description                                            | | ||||
| | ------------ | -------- | ------------------------------------------------------ | | ||||
| | `stream`     | iterable | A stream of documents.                                 | | ||||
| | `batch_size` | int      | The number of texts to buffer. Defaults to `128`.      | | ||||
| | **YIELDS**   | `Doc`    | Processed documents in the order of the original text. | | ||||
| | Name         | Type            | Description                                            | | ||||
| | ------------ | --------------- | ------------------------------------------------------ | | ||||
| | `stream`     | `Iterable[Doc]` | A stream of documents.                                 | | ||||
| | `batch_size` | int             | The number of texts to buffer. Defaults to `128`.      | | ||||
| | **YIELDS**   | `Doc`           | Processed documents in the order of the original text. | | ||||
| 
 | ||||
| ## DependencyParser.predict {#predict tag="method"} | ||||
| 
 | ||||
|  | @ -104,7 +109,7 @@ Apply the pipeline's model to a batch of docs, without modifying them. | |||
| 
 | ||||
| | Name        | Type                | Description                                    | | ||||
| | ----------- | ------------------- | ---------------------------------------------- | | ||||
| | `docs`      | iterable            | The documents to predict.                      | | ||||
| | `docs`      | `Iterable[Doc]`     | The documents to predict.                      | | ||||
| | **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). | | ||||
| 
 | ||||
| ## DependencyParser.set_annotations {#set_annotations tag="method"} | ||||
|  | @ -119,33 +124,34 @@ Modify a batch of documents, using pre-computed scores. | |||
| > parser.set_annotations([doc1, doc2], scores) | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type     | Description                                                | | ||||
| | -------- | -------- | ---------------------------------------------------------- | | ||||
| | `docs`   | iterable | The documents to modify.                                   | | ||||
| | `scores` | -        | The scores to set, produced by `DependencyParser.predict`. | | ||||
| | Name     | Type                | Description                                                | | ||||
| | -------- | ------------------- | ---------------------------------------------------------- | | ||||
| | `docs`   | `Iterable[Doc]`     | The documents to modify.                                   | | ||||
| | `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. | | ||||
| 
 | ||||
| ## DependencyParser.update {#update tag="method"} | ||||
| 
 | ||||
| Learn from a batch of documents and gold-standard information, updating the | ||||
| pipe's model. Delegates to [`predict`](/api/dependencyparser#predict) and | ||||
| Learn from a batch of [`Example`](/api/example) objects, updating the pipe's | ||||
| model. Delegates to [`predict`](/api/dependencyparser#predict) and | ||||
| [`get_loss`](/api/dependencyparser#get_loss). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > parser = DependencyParser(nlp.vocab) | ||||
| > losses = {} | ||||
| > parser = DependencyParser(nlp.vocab, parser_model) | ||||
| > optimizer = nlp.begin_training() | ||||
| > parser.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) | ||||
| > losses = parser.update(examples, sgd=optimizer) | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type     | Description                                                                                  | | ||||
| | -------- | -------- | -------------------------------------------------------------------------------------------- | | ||||
| | `docs`   | iterable | A batch of documents to learn from.                                                          | | ||||
| | `golds`  | iterable | The gold-standard data. Must have the same length as `docs`.                                 | | ||||
| | `drop`   | float    | The dropout rate.                                                                            | | ||||
| | `sgd`    | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID.       | | ||||
| | `losses` | dict     | Optional record of the loss during training. The value keyed by the model's name is updated. | | ||||
| | Name              | Type                | Description                                                                                                                                    | | ||||
| | ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                    | | ||||
| | _keyword-only_    |                     |                                                                                                                                                | | ||||
| | `drop`            | float               | The dropout rate.                                                                                                                              | | ||||
| | `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). | | ||||
| | `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                                | | ||||
| | `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                   | | ||||
| | **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                               | | ||||
| 
 | ||||
| ## DependencyParser.get_loss {#get_loss tag="method"} | ||||
| 
 | ||||
|  | @ -156,21 +162,20 @@ predicted scores. | |||
| > | ||||
| > ```python | ||||
| > parser = DependencyParser(nlp.vocab) | ||||
| > scores = parser.predict([doc1, doc2]) | ||||
| > loss, d_loss = parser.get_loss([doc1, doc2], [gold1, gold2], scores) | ||||
| > scores = parser.predict([eg.predicted for eg in examples]) | ||||
| > loss, d_loss = parser.get_loss(examples, scores) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description                                                  | | ||||
| | ----------- | -------- | ------------------------------------------------------------ | | ||||
| | `docs`      | iterable | The batch of documents.                                      | | ||||
| | `golds`     | iterable | The gold-standard data. Must have the same length as `docs`. | | ||||
| | `scores`    | -        | Scores representing the model's predictions.                 | | ||||
| | **RETURNS** | tuple    | The loss and the gradient, i.e. `(loss, gradient)`.          | | ||||
| | Name        | Type                | Description                                         | | ||||
| | ----------- | ------------------- | --------------------------------------------------- | | ||||
| | `examples`  | `Iterable[Example]` | The batch of examples.                              | | ||||
| | `scores`    | `syntax.StateClass` | Scores representing the model's predictions.        | | ||||
| | **RETURNS** | tuple               | The loss and the gradient, i.e. `(loss, gradient)`. | | ||||
| 
 | ||||
| ## DependencyParser.begin_training {#begin_training tag="method"} | ||||
| 
 | ||||
| Initialize the pipe for training, using data examples if available. If no model | ||||
| has been initialized yet, the model is added. | ||||
| Initialize the pipe for training, using data examples if available. Return an | ||||
| [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -180,16 +185,17 @@ has been initialized yet, the model is added. | |||
| > optimizer = parser.begin_training(pipeline=nlp.pipeline) | ||||
| > ``` | ||||
| 
 | ||||
| | Name          | Type     | Description                                                                                                                                                                                 | | ||||
| | ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects.                                                                                           | | ||||
| | `pipeline`    | list     | Optional list of pipeline components that this component is part of.                                                                                                                        | | ||||
| | `sgd`         | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`DependencyParser`](/api/dependencyparser#create_optimizer) if not set. | | ||||
| | **RETURNS**   | callable | An optimizer.                                                                                                                                                                               | | ||||
| | Name           | Type                    | Description                                                                                                                                                          | | ||||
| | -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                                 | | ||||
| | `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                                 | | ||||
| | `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. | | ||||
| | **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                        | | ||||
| 
 | ||||
| ## DependencyParser.create_optimizer {#create_optimizer tag="method"} | ||||
| 
 | ||||
| Create an optimizer for the pipeline component. | ||||
| Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline | ||||
| component. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -198,9 +204,9 @@ Create an optimizer for the pipeline component. | |||
| > optimizer = parser.create_optimizer() | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description    | | ||||
| | ----------- | -------- | -------------- | | ||||
| | **RETURNS** | callable | The optimizer. | | ||||
| | Name        | Type        | Description                                                     | | ||||
| | ----------- | ----------- | --------------------------------------------------------------- | | ||||
| | **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | | ||||
| 
 | ||||
| ## DependencyParser.use_params {#use_params tag="method, contextmanager"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -12,44 +12,47 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline | |||
| component is available in the [processing pipeline](/usage/processing-pipelines) | ||||
| via the ID `"entity_linker"`. | ||||
| 
 | ||||
| ## EntityLinker.Model {#model tag="classmethod"} | ||||
| ## Default config {#config} | ||||
| 
 | ||||
| Initialize a model for the pipe. The model should implement the | ||||
| `thinc.neural.Model` API, and should contain a field `tok2vec` that contains the | ||||
| context encoder. Wrappers are under development for most major machine learning | ||||
| libraries. | ||||
| This is the default configuration used to initialize the model powering the | ||||
| pipeline component. See the [model architectures](/api/architectures) | ||||
| documentation for details on the architectures and their arguments and | ||||
| hyperparameters. To learn more about how to customize the config and train | ||||
| custom models, check out the [training config](/usage/training#config) docs. | ||||
| 
 | ||||
| | Name        | Type   | Description                           | | ||||
| | ----------- | ------ | ------------------------------------- | | ||||
| | `**kwargs`  | -      | Parameters for initializing the model | | ||||
| | **RETURNS** | object | The initialized model.                | | ||||
| ```python | ||||
| https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_linker_defaults.cfg | ||||
| ``` | ||||
| 
 | ||||
| ## EntityLinker.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Construction via create_pipe with default model | ||||
| > entity_linker = nlp.create_pipe("entity_linker") | ||||
| > | ||||
| > # Construction via create_pipe with custom model | ||||
| > config = {"model": {"@architectures": "my_el"}} | ||||
| > entity_linker = nlp.create_pipe("entity_linker", config) | ||||
| > | ||||
| > # Construction from class with custom model from file | ||||
| > from spacy.pipeline import EntityLinker | ||||
| > model = util.load_config("model.cfg", create_objects=True)["model"] | ||||
| > entity_linker = EntityLinker(nlp.vocab, model) | ||||
| > ``` | ||||
| 
 | ||||
| Create a new pipeline instance. In your application, you would normally use a | ||||
| shortcut for this and instantiate the component using its string name and | ||||
| [`nlp.create_pipe`](/api/language#create_pipe). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Construction via create_pipe | ||||
| > entity_linker = nlp.create_pipe("entity_linker") | ||||
| > | ||||
| > # Construction from class | ||||
| > from spacy.pipeline import EntityLinker | ||||
| > entity_linker = EntityLinker(nlp.vocab) | ||||
| > entity_linker.from_disk("/path/to/model") | ||||
| > ``` | ||||
| | Name    | Type    | Description                                                                     | | ||||
| | ------- | ------- | ------------------------------------------------------------------------------- | | ||||
| | `vocab` | `Vocab` | The shared vocabulary.                                                          | | ||||
| | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | ||||
| | `**cfg` | -       | Configuration parameters.                                                       | | ||||
| 
 | ||||
| | Name           | Type                          | Description                                                                                                                                           | | ||||
| | -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `vocab`        | `Vocab`                       | The shared vocabulary.                                                                                                                                | | ||||
| | `model`        | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | | ||||
| | `hidden_width` | int                           | Width of the hidden layer of the entity linking model, defaults to `128`.                                                                             | | ||||
| | `incl_prior`   | bool                          | Whether or not to include prior probabilities in the model. Defaults to `True`.                                                                       | | ||||
| | `incl_context` | bool                          | Whether or not to include the local context in the model (if not: only prior probabilities are used). Defaults to `True`.                             | | ||||
| | **RETURNS**    | `EntityLinker`                | The newly constructed object.                                                                                                                         | | ||||
| | **RETURNS** | `EntityLinker` | The newly constructed object. | | ||||
| 
 | ||||
| ## EntityLinker.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
|  | @ -91,11 +94,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and | |||
| >     pass | ||||
| > ``` | ||||
| 
 | ||||
| | Name         | Type     | Description                                            | | ||||
| | ------------ | -------- | ------------------------------------------------------ | | ||||
| | `stream`     | iterable | A stream of documents.                                 | | ||||
| | `batch_size` | int      | The number of texts to buffer. Defaults to `128`.      | | ||||
| | **YIELDS**   | `Doc`    | Processed documents in the order of the original text. | | ||||
| | Name         | Type            | Description                                            | | ||||
| | ------------ | --------------- | ------------------------------------------------------ | | ||||
| | `stream`     | `Iterable[Doc]` | A stream of documents.                                 | | ||||
| | `batch_size` | int             | The number of texts to buffer. Defaults to `128`.      | | ||||
| | **YIELDS**   | `Doc`           | Processed documents in the order of the original text. | | ||||
| 
 | ||||
| ## EntityLinker.predict {#predict tag="method"} | ||||
| 
 | ||||
|  | @ -105,13 +108,13 @@ Apply the pipeline's model to a batch of docs, without modifying them. | |||
| > | ||||
| > ```python | ||||
| > entity_linker = EntityLinker(nlp.vocab) | ||||
| > kb_ids, tensors = entity_linker.predict([doc1, doc2]) | ||||
| > kb_ids = entity_linker.predict([doc1, doc2]) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description                                                                                                                                                                                        | | ||||
| | ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `docs`      | iterable | The documents to predict.                                                                                                                                                                          | | ||||
| | **RETURNS** | tuple    | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. | | ||||
| | Name        | Type            | Description                                                  | | ||||
| | ----------- | --------------- | ------------------------------------------------------------ | | ||||
| | `docs`      | `Iterable[Doc]` | The documents to predict.                                    | | ||||
| | **RETURNS** | `Iterable[str]` | The predicted KB identifiers for the entities in the `docs`. | | ||||
| 
 | ||||
| ## EntityLinker.set_annotations {#set_annotations tag="method"} | ||||
| 
 | ||||
|  | @ -122,19 +125,18 @@ entities. | |||
| > | ||||
| > ```python | ||||
| > entity_linker = EntityLinker(nlp.vocab) | ||||
| > kb_ids, tensors = entity_linker.predict([doc1, doc2]) | ||||
| > entity_linker.set_annotations([doc1, doc2], kb_ids, tensors) | ||||
| > kb_ids = entity_linker.predict([doc1, doc2]) | ||||
| > entity_linker.set_annotations([doc1, doc2], kb_ids) | ||||
| > ``` | ||||
| 
 | ||||
| | Name      | Type     | Description                                                                                       | | ||||
| | --------- | -------- | ------------------------------------------------------------------------------------------------- | | ||||
| | `docs`    | iterable | The documents to modify.                                                                          | | ||||
| | `kb_ids`  | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | | ||||
| | `tensors` | iterable | The token representations used to predict the identifiers.                                        | | ||||
| | Name     | Type            | Description                                                                                       | | ||||
| | -------- | --------------- | ------------------------------------------------------------------------------------------------- | | ||||
| | `docs`   | `Iterable[Doc]` | The documents to modify.                                                                          | | ||||
| | `kb_ids` | `Iterable[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | | ||||
| 
 | ||||
| ## EntityLinker.update {#update tag="method"} | ||||
| 
 | ||||
| Learn from a batch of documents and gold-standard information, updating both the | ||||
| Learn from a batch of [`Example`](/api/example) objects, updating both the | ||||
| pipe's entity linking model and context encoder. Delegates to | ||||
| [`predict`](/api/entitylinker#predict) and | ||||
| [`get_loss`](/api/entitylinker#get_loss). | ||||
|  | @ -142,40 +144,20 @@ pipe's entity linking model and context encoder. Delegates to | |||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > entity_linker = EntityLinker(nlp.vocab) | ||||
| > losses = {} | ||||
| > entity_linker = EntityLinker(nlp.vocab, nel_model) | ||||
| > optimizer = nlp.begin_training() | ||||
| > entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) | ||||
| > losses = entity_linker.update(examples, sgd=optimizer) | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type     | Description                                                                                             | | ||||
| | -------- | -------- | ------------------------------------------------------------------------------------------------------- | | ||||
| | `docs`   | iterable | A batch of documents to learn from.                                                                     | | ||||
| | `golds`  | iterable | The gold-standard data. Must have the same length as `docs`.                                            | | ||||
| | `drop`   | float    | The dropout rate, used both for the EL model and the context encoder.                                   | | ||||
| | `sgd`    | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. | | ||||
| | `losses` | dict     | Optional record of the loss during training. The value keyed by the model's name is updated.            | | ||||
| 
 | ||||
| ## EntityLinker.get_loss {#get_loss tag="method"} | ||||
| 
 | ||||
| Find the loss and gradient of loss for the entities in a batch of documents and | ||||
| their predicted scores. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > entity_linker = EntityLinker(nlp.vocab) | ||||
| > kb_ids, tensors = entity_linker.predict(docs) | ||||
| > loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description                                                  | | ||||
| | ----------- | -------- | ------------------------------------------------------------ | | ||||
| | `docs`      | iterable | The batch of documents.                                      | | ||||
| | `golds`     | iterable | The gold-standard data. Must have the same length as `docs`. | | ||||
| | `kb_ids`    | iterable | KB identifiers representing the model's predictions.         | | ||||
| | `tensors`   | iterable | The token representations used to predict the identifiers    | | ||||
| | **RETURNS** | tuple    | The loss and the gradient, i.e. `(loss, gradient)`.          | | ||||
| | Name              | Type                | Description                                                                                                                                | | ||||
| | ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                | | ||||
| | _keyword-only_    |                     |                                                                                                                                            | | ||||
| | `drop`            | float               | The dropout rate.                                                                                                                          | | ||||
| | `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations). | | ||||
| | `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                            | | ||||
| | `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                               | | ||||
| | **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                           | | ||||
| 
 | ||||
| ## EntityLinker.set_kb {#set_kb tag="method"} | ||||
| 
 | ||||
|  | @ -195,9 +177,9 @@ identifiers. | |||
| 
 | ||||
| ## EntityLinker.begin_training {#begin_training tag="method"} | ||||
| 
 | ||||
| Initialize the pipe for training, using data examples if available. If no model | ||||
| has been initialized yet, the model is added. Before calling this method, a | ||||
| knowledge base should have been defined with | ||||
| Initialize the pipe for training, using data examples if available. Return an | ||||
| [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this | ||||
| method, a knowledge base should have been defined with | ||||
| [`set_kb`](/api/entitylinker#set_kb). | ||||
| 
 | ||||
| > #### Example | ||||
|  | @ -209,12 +191,12 @@ knowledge base should have been defined with | |||
| > optimizer = entity_linker.begin_training(pipeline=nlp.pipeline) | ||||
| > ``` | ||||
| 
 | ||||
| | Name          | Type     | Description                                                                                                                                                                         | | ||||
| | ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects.                                                                                   | | ||||
| | `pipeline`    | list     | Optional list of pipeline components that this component is part of.                                                                                                                | | ||||
| | `sgd`         | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. | | ||||
| | **RETURNS**   | callable | An optimizer.                                                                                                                                                                       | | ||||
| | Name           | Type                    | Description                                                                                                                                                      | | ||||
| | -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                             | | ||||
| | `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                             | | ||||
| | `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. | | ||||
| | **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                    | | ||||
| 
 | ||||
| ## EntityLinker.create_optimizer {#create_optimizer tag="method"} | ||||
| 
 | ||||
|  | @ -227,9 +209,9 @@ Create an optimizer for the pipeline component. | |||
| > optimizer = entity_linker.create_optimizer() | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description    | | ||||
| | ----------- | -------- | -------------- | | ||||
| | **RETURNS** | callable | The optimizer. | | ||||
| | Name        | Type        | Description                                                     | | ||||
| | ----------- | ----------- | --------------------------------------------------------------- | | ||||
| | **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | | ||||
| 
 | ||||
| ## EntityLinker.use_params {#use_params tag="method, contextmanager"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline | |||
| component is available in the [processing pipeline](/usage/processing-pipelines) | ||||
| via the ID `"ner"`. | ||||
| 
 | ||||
| ## EntityRecognizer.Model {#model tag="classmethod"} | ||||
| ## Default config {#config} | ||||
| 
 | ||||
| Initialize a model for the pipe. The model should implement the | ||||
| `thinc.neural.Model` API. Wrappers are under development for most major machine | ||||
| learning libraries. | ||||
| This is the default configuration used to initialize the model powering the | ||||
| pipeline component. See the [model architectures](/api/architectures) | ||||
| documentation for details on the architectures and their arguments and | ||||
| hyperparameters. To learn more about how to customize the config and train | ||||
| custom models, check out the [training config](/usage/training#config) docs. | ||||
| 
 | ||||
| | Name        | Type   | Description                           | | ||||
| | ----------- | ------ | ------------------------------------- | | ||||
| | `**kwargs`  | -      | Parameters for initializing the model | | ||||
| | **RETURNS** | object | The initialized model.                | | ||||
| ```python | ||||
| https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defaults.cfg | ||||
| ``` | ||||
| 
 | ||||
| ## EntityRecognizer.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Create a new pipeline instance. In your application, you would normally use a | ||||
| shortcut for this and instantiate the component using its string name and | ||||
| [`nlp.create_pipe`](/api/language#create_pipe). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Construction via create_pipe | ||||
| > ner = nlp.create_pipe("ner") | ||||
| > | ||||
| > # Construction from class | ||||
| > # Construction via create_pipe with custom model | ||||
| > config = {"model": {"@architectures": "my_ner"}} | ||||
| > ner = nlp.create_pipe("ner", config) | ||||
| > | ||||
| > # Construction from class with custom model from file | ||||
| > from spacy.pipeline import EntityRecognizer | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > ner.from_disk("/path/to/model") | ||||
| > model = util.load_config("model.cfg", create_objects=True)["model"] | ||||
| > ner = EntityRecognizer(nlp.vocab, model) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type                          | Description                                                                                                                                           | | ||||
| | ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `vocab`     | `Vocab`                       | The shared vocabulary.                                                                                                                                | | ||||
| | `model`     | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | | ||||
| | `**cfg`     | -                             | Configuration parameters.                                                                                                                             | | ||||
| | **RETURNS** | `EntityRecognizer`            | The newly constructed object.                                                                                                                         | | ||||
| Create a new pipeline instance. In your application, you would normally use a | ||||
| shortcut for this and instantiate the component using its string name and | ||||
| [`nlp.create_pipe`](/api/language#create_pipe). | ||||
| 
 | ||||
| | Name        | Type               | Description                                                                     | | ||||
| | ----------- | ------------------ | ------------------------------------------------------------------------------- | | ||||
| | `vocab`     | `Vocab`            | The shared vocabulary.                                                          | | ||||
| | `model`     | `Model`            | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | ||||
| | `**cfg`     | -                  | Configuration parameters.                                                       | | ||||
| | **RETURNS** | `EntityRecognizer` | The newly constructed object.                                                   | | ||||
| 
 | ||||
| ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
|  | @ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and | |||
| >     pass | ||||
| > ``` | ||||
| 
 | ||||
| | Name         | Type     | Description                                            | | ||||
| | ------------ | -------- | ------------------------------------------------------ | | ||||
| | `stream`     | iterable | A stream of documents.                                 | | ||||
| | `batch_size` | int      | The number of texts to buffer. Defaults to `128`.      | | ||||
| | **YIELDS**   | `Doc`    | Processed documents in the order of the original text. | | ||||
| | Name         | Type            | Description                                            | | ||||
| | ------------ | --------------- | ------------------------------------------------------ | | ||||
| | `stream`     | `Iterable[Doc]` | A stream of documents.                                 | | ||||
| | `batch_size` | int             | The number of documents to buffer. Defaults to `128`.  | | ||||
| | **YIELDS**   | `Doc`           | Processed documents in the order of the original text. | | ||||
| 
 | ||||
| ## EntityRecognizer.predict {#predict tag="method"} | ||||
| 
 | ||||
|  | @ -99,13 +104,13 @@ Apply the pipeline's model to a batch of docs, without modifying them. | |||
| > | ||||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > scores, tensors = ner.predict([doc1, doc2]) | ||||
| > scores = ner.predict([doc1, doc2]) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description                                                                                                                                                                                                                        | | ||||
| | ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `docs`      | iterable | The documents to predict.                                                                                                                                                                                                          | | ||||
| | **RETURNS** | list | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). | | ||||
| | Name        | Type               | Description                                                                                                | | ||||
| | ----------- | ------------------ | ---------------------------------------------------------------------------------------------------------- | | ||||
| | `docs`      | `Iterable[Doc]`    | The documents to predict.                                                                                  | | ||||
| | **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). | | ||||
| 
 | ||||
| ## EntityRecognizer.set_annotations {#set_annotations tag="method"} | ||||
| 
 | ||||
|  | @ -115,38 +120,38 @@ Modify a batch of documents, using pre-computed scores. | |||
| > | ||||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > scores, tensors = ner.predict([doc1, doc2]) | ||||
| > ner.set_annotations([doc1, doc2], scores, tensors) | ||||
| > scores = ner.predict([doc1, doc2]) | ||||
| > ner.set_annotations([doc1, doc2], scores) | ||||
| > ``` | ||||
| 
 | ||||
| | Name      | Type     | Description                                                | | ||||
| | --------- | -------- | ---------------------------------------------------------- | | ||||
| | `docs`    | iterable | The documents to modify.                                   | | ||||
| | `scores`  | -        | The scores to set, produced by `EntityRecognizer.predict`. | | ||||
| | `tensors` | iterable | The token representations used to predict the scores.      | | ||||
| | Name     | Type               | Description                                                | | ||||
| | -------- | ------------------ | ---------------------------------------------------------- | | ||||
| | `docs`   | `Iterable[Doc]`    | The documents to modify.                                   | | ||||
| | `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`. | | ||||
| 
 | ||||
| ## EntityRecognizer.update {#update tag="method"} | ||||
| 
 | ||||
| Learn from a batch of documents and gold-standard information, updating the | ||||
| pipe's model. Delegates to [`predict`](/api/entityrecognizer#predict) and | ||||
| Learn from a batch of [`Example`](/api/example) objects, updating the pipe's | ||||
| model. Delegates to [`predict`](/api/entityrecognizer#predict) and | ||||
| [`get_loss`](/api/entityrecognizer#get_loss). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > losses = {} | ||||
| > ner = EntityRecognizer(nlp.vocab, ner_model) | ||||
| > optimizer = nlp.begin_training() | ||||
| > ner.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) | ||||
| > losses = ner.update(examples, sgd=optimizer) | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type     | Description                                                                                  | | ||||
| | -------- | -------- | -------------------------------------------------------------------------------------------- | | ||||
| | `docs`   | iterable | A batch of documents to learn from.                                                          | | ||||
| | `golds`  | iterable | The gold-standard data. Must have the same length as `docs`.                                 | | ||||
| | `drop`   | float    | The dropout rate.                                                                            | | ||||
| | `sgd`    | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID.       | | ||||
| | `losses` | dict     | Optional record of the loss during training. The value keyed by the model's name is updated. | | ||||
| | Name              | Type                | Description                                                                                                                                    | | ||||
| | ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                    | | ||||
| | _keyword-only_    |                     |                                                                                                                                                | | ||||
| | `drop`            | float               | The dropout rate.                                                                                                                              | | ||||
| | `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). | | ||||
| | `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                                | | ||||
| | `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                   | | ||||
| | **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                               | | ||||
| 
 | ||||
| ## EntityRecognizer.get_loss {#get_loss tag="method"} | ||||
| 
 | ||||
|  | @ -157,21 +162,20 @@ predicted scores. | |||
| > | ||||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > scores = ner.predict([doc1, doc2]) | ||||
| > loss, d_loss = ner.get_loss([doc1, doc2], [gold1, gold2], scores) | ||||
| > scores = ner.predict([eg.predicted for eg in examples]) | ||||
| > loss, d_loss = ner.get_loss(examples, scores) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description                                                  | | ||||
| | ----------- | -------- | ------------------------------------------------------------ | | ||||
| | `docs`      | iterable | The batch of documents.                                      | | ||||
| | `golds`     | iterable | The gold-standard data. Must have the same length as `docs`. | | ||||
| | `scores`    | -        | Scores representing the model's predictions.                 | | ||||
| | **RETURNS** | tuple    | The loss and the gradient, i.e. `(loss, gradient)`.          | | ||||
| | Name        | Type                | Description                                         | | ||||
| | ----------- | ------------------- | --------------------------------------------------- | | ||||
| | `examples`  | `Iterable[Example]` | The batch of examples.                              | | ||||
| | `scores`    | `List[StateClass]`  | Scores representing the model's predictions.        | | ||||
| | **RETURNS** | tuple               | The loss and the gradient, i.e. `(loss, gradient)`. | | ||||
| 
 | ||||
| ## EntityRecognizer.begin_training {#begin_training tag="method"} | ||||
| 
 | ||||
| Initialize the pipe for training, using data examples if available. If no model | ||||
| has been initialized yet, the model is added. | ||||
| Initialize the pipe for training, using data examples if available. Return an | ||||
| [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -181,12 +185,12 @@ has been initialized yet, the model is added. | |||
| > optimizer = ner.begin_training(pipeline=nlp.pipeline) | ||||
| > ``` | ||||
| 
 | ||||
| | Name          | Type     | Description                                                                                                                                                                                 | | ||||
| | ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects.                                                                                           | | ||||
| | `pipeline`    | list     | Optional list of pipeline components that this component is part of.                                                                                                                        | | ||||
| | `sgd`         | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityRecognizer`](/api/entityrecognizer#create_optimizer) if not set. | | ||||
| | **RETURNS**   | callable | An optimizer.                                                                                                                                                                               | | ||||
| | Name           | Type                    | Description                                                                                                                                                          | | ||||
| | -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                                 | | ||||
| | `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                                 | | ||||
| | `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. | | ||||
| | **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                        | | ||||
| 
 | ||||
| ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} | ||||
| 
 | ||||
|  | @ -199,9 +203,9 @@ Create an optimizer for the pipeline component. | |||
| > optimizer = ner.create_optimizer() | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description    | | ||||
| | ----------- | -------- | -------------- | | ||||
| | **RETURNS** | callable | The optimizer. | | ||||
| | Name        | Type        | Description                                                     | | ||||
| | ----------- | ----------- | --------------------------------------------------------------- | | ||||
| | **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | | ||||
| 
 | ||||
| ## EntityRecognizer.use_params {#use_params tag="method, contextmanager"} | ||||
| 
 | ||||
|  |  | |||
Some files were not shown because too many files have changed in this diff Show More
		Loading…
	
		Reference in New Issue
	
	Block a user