Merge remote-tracking branch 'origin/develop' into rliaw-develop

Richard Liaw 2020-07-11 19:47:28 -07:00
commit 3bccf8b954
138 changed files with 5114 additions and 3110 deletions


@@ -5,16 +5,16 @@
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
-max_length = 5000
+max_length = 3000
limit = 0
# Data augmentation
orth_variant_level = 0.0
-dropout = 0.2
+dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
-patience = 1600
+patience = 100000
max_epochs = 0
-max_steps = 20000
+max_steps = 0
-eval_frequency = 500
+eval_frequency = 1000
# Other settings
seed = 0
accumulate_gradient = 1
@@ -26,6 +26,7 @@ score_weights = {"ents_f": 1.0}
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
+batch_by = "words"
[training.batch_size]
@schedules = "compounding.v1"
@@ -37,19 +38,13 @@ compound = 1.001
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
-L2_is_weight_decay = false
+L2_is_weight_decay = true
-L2 = 1e-6
+L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
-#[optimizer.learn_rate]
-#@schedules = "warmup_linear.v1"
-#warmup_steps = 250
-#total_steps = 20000
-#initial_rate = 0.001
[nlp]
lang = "en"
vectors = null
@@ -58,8 +53,6 @@ vectors = null
factory = "ner"
learn_tokens = false
min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
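As a reference for the schedule above: a minimal sketch of how Thinc's compounding.v1 batch-size schedule behaves. The start/stop values below are illustrative assumptions; only compound = 1.001 is visible in this hunk.

from thinc.api import compounding

# The batch size grows geometrically from `start` toward `stop` by a factor
# of `compound` per step; with compound=1.001 the growth is very gradual.
batch_sizes = compounding(start=100.0, stop=1000.0, compound=1.001)
for step, size in zip(range(5), batch_sizes):
    print(step, int(size))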

spacy/about.py

@@ -1,8 +1,7 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a2"
+__version__ = "3.0.0a4"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
__projects__ = "https://github.com/explosion/spacy-boilerplates"

spacy/cli/__init__.py

@@ -11,12 +11,15 @@ from .profile import profile  # noqa: F401
from .train import train_cli  # noqa: F401
from .pretrain import pretrain  # noqa: F401
from .debug_data import debug_data  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
from .evaluate import evaluate  # noqa: F401
from .convert import convert  # noqa: F401
from .init_model import init_model  # noqa: F401
from .validate import validate  # noqa: F401
-from .project import project_clone, project_assets, project_run  # noqa: F401
-from .project import project_run_all  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

spacy/cli/_app.py

@@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface
DOCS: https://spacy.io/api/cli
"""
+PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
+project templates. You'd typically start by cloning a project template to a local
+directory and fetching its assets like datasets etc. See the project's
+project.yml for the available commands.
+"""
app = typer.Typer(name=NAME, help=HELP)
+project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
+app.add_typer(project_cli)
# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
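The add_typer call above mounts the project subcommands as a named command group. A self-contained sketch of the same Typer pattern (the command and names here are illustrative, not spaCy's):

import typer

app = typer.Typer(name="spacy", help="Main CLI")
project_cli = typer.Typer(name="project", help="Project commands", no_args_is_help=True)
app.add_typer(project_cli)

@project_cli.command("clone")
def clone(name: str):
    # Registered on the sub-app, so it's invoked as: spacy project clone NAME
    typer.echo(f"Would clone template: {name}")

if __name__ == "__main__":
    app()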

spacy/cli/convert.py

@@ -120,8 +120,12 @@ def convert(
no_print=silent,
ner_map=ner_map,
)
+if file_type == "json":
+data = [docs_to_json(docs)]
+else:
+data = DocBin(docs=docs, store_user_data=True).to_bytes()
if output_dir == "-":
-_print_docs_to_stdout(docs, file_type)
+_print_docs_to_stdout(data, file_type)
else:
if input_loc != input_path:
subpath = input_loc.relative_to(input_path)
@@ -129,24 +133,23 @@
else:
output_file = Path(output_dir) / input_loc.parts[-1]
output_file = output_file.with_suffix(f".{file_type}")
-_write_docs_to_file(docs, output_file, file_type)
+_write_docs_to_file(data, output_file, file_type)
msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
-def _print_docs_to_stdout(docs, output_type):
+def _print_docs_to_stdout(data, output_type):
if output_type == "json":
-srsly.write_json("-", [docs_to_json(docs)])
+srsly.write_json("-", data)
else:
-sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
+sys.stdout.buffer.write(data)
-def _write_docs_to_file(docs, output_file, output_type):
+def _write_docs_to_file(data, output_file, output_type):
if not output_file.parent.exists():
output_file.parent.mkdir(parents=True)
if output_type == "json":
-srsly.write_json(output_file, [docs_to_json(docs)])
+srsly.write_json(output_file, data)
else:
-data = DocBin(docs=docs, store_user_data=True).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
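The refactor above computes the serialized data once, either as a JSON list via docs_to_json or as DocBin bytes, and hands it to both writers. A short sketch of the DocBin byte round-trip this relies on (the blank pipeline and texts are illustrative):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("Berlin is a city."), nlp("So is Paris.")]
data = DocBin(docs=docs, store_user_data=True).to_bytes()

# The bytes written to stdout/file above can be restored later:
restored = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
print([doc.text for doc in restored])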

spacy/cli/debug_model.py (new file, 168 lines)

@@ -0,0 +1,168 @@
from typing import List
from pathlib import Path
from wasabi import msg
from ._app import app, Arg, Opt
from .. import util
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from ..lang.en import English
@app.command("debug-model")
def debug_model_cli(
# fmt: off
config_path: Path = Arg(..., help="Path to config file", exists=True),
layers: str = Opt("", "--layers", "-l", help="Comma-separated indices of layers to print"),
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
seed: int = Opt(None, "--seed", "-s", help="Fix random seed"),
# fmt: on
):
"""
Analyze a Thinc ML model - internal structure and activations during training
"""
print_settings = {
"dimensions": dimensions,
"parameters": parameters,
"gradients": gradients,
"attributes": attributes,
"layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
"print_before_training": P0,
"print_after_init": P1,
"print_after_training": P2,
"print_prediction": P3,
}
if seed is not None:
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
if use_gpu >= 0:
msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
else:
msg.info(f"Using CPU")
debug_model(
config_path,
print_settings=print_settings,
)
def debug_model(
config_path: Path,
*,
print_settings=None
):
if print_settings is None:
print_settings = {}
model = util.load_config(config_path, create_objects=True)["model"]
# STEP 0: Printing before training
msg.info(f"Analysing model with ID {model.id}")
if print_settings.get("print_before_training"):
msg.info(f"Before training:")
_print_model(model, print_settings)
# STEP 1: Initializing the model and printing again
model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
if print_settings.get("print_after_init"):
msg.info(f"After initialization:")
_print_model(model, print_settings)
# STEP 2: Updating the model and printing again
optimizer = Adam(0.001)
set_dropout_rate(model, 0.2)
for e in range(3):
Y, get_dX = model.begin_update(_get_docs())
dY = get_gradient(model, Y)
_ = get_dX(dY)
model.finish_update(optimizer)
if print_settings.get("print_after_training"):
msg.info(f"After training:")
_print_model(model, print_settings)
# STEP 3: the final prediction
prediction = model.predict(_get_docs())
if print_settings.get("print_prediction"):
msg.info(f"Prediction:", str(prediction))
def get_gradient(model, Y):
goldY = _get_output(model.ops.xp)
return Y - goldY
def _sentences():
return [
"Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers",
"San Francisco considers banning sidewalk delivery robots",
"London is a big city in the United Kingdom.",
]
def _get_docs():
nlp = English()
return list(nlp.pipe(_sentences()))
def _get_output(xp):
return xp.asarray([xp.asarray([i+10, i+20, i+30], dtype="float32") for i, _ in enumerate(_get_docs())])
def _print_model(model, print_settings):
layers = print_settings.get("layers", "")
parameters = print_settings.get("parameters", False)
dimensions = print_settings.get("dimensions", False)
gradients = print_settings.get("gradients", False)
attributes = print_settings.get("attributes", False)
for i, node in enumerate(model.walk()):
if not layers or i in layers:
msg.info(f"Layer {i}: model ID {node.id}: '{node.name}'")
if dimensions:
for name in node.dim_names:
if node.has_dim(name):
msg.info(f" - dim {name}: {node.get_dim(name)}")
else:
msg.info(f" - dim {name}: {node.has_dim(name)}")
if parameters:
for name in node.param_names:
if node.has_param(name):
print_value = _print_matrix(node.get_param(name))
msg.info(f" - param {name}: {print_value}")
else:
msg.info(f" - param {name}: {node.has_param(name)}")
if gradients:
for name in node.param_names:
if node.has_grad(name):
print_value = _print_matrix(node.get_grad(name))
msg.info(f" - grad {name}: {print_value}")
else:
msg.info(f" - grad {name}: {node.has_grad(name)}")
if attributes:
attrs = node.attrs
for name, value in attrs.items():
msg.info(f" - attr {name}: {value}")
def _print_matrix(value):
if value is None or isinstance(value, bool):
return value
result = str(value.shape) + " - sample: "
sample_matrix = value
for d in range(value.ndim-1):
sample_matrix = sample_matrix[0]
sample_matrix = sample_matrix[0:5]
result = result + str(sample_matrix)
return result
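The STEP 2 loop above is a bare Thinc update cycle. A self-contained sketch of the same begin_update -> backprop -> finish_update sequence on a toy Linear layer (all shapes and values are illustrative):

import numpy
from thinc.api import Adam, Linear, fix_random_seed, set_dropout_rate

fix_random_seed(0)
model = Linear(nO=3, nI=4)
X = numpy.random.uniform(size=(8, 4)).astype("float32")
Y_gold = numpy.ones((8, 3), dtype="float32")
model.initialize(X=X, Y=Y_gold)
optimizer = Adam(0.001)
set_dropout_rate(model, 0.2)  # no-op for a plain Linear; shown for parity with the CLI
for _ in range(3):
    Y, backprop = model.begin_update(X)
    backprop(Y - Y_gold)  # same "prediction minus gold" gradient as get_gradient above
    model.finish_update(optimizer)
print(model.predict(X).shape)  # (8, 3)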

spacy/cli/download.py

@@ -1,4 +1,4 @@
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence
import requests
import sys
from wasabi import msg
@@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
from .. import about
from ..util import is_package, get_base_version, run_command
+# These are the old shortcuts we previously supported in spacy download. As of
+# v3, shortcuts are deprecated so we're not expecting to add anything to this
+# list. It only exists to show users warnings.
+OLD_SHORTCUTS = {
+"en": "en_core_web_sm",
+"de": "de_core_news_sm",
+"es": "es_core_news_sm",
+"pt": "pt_core_news_sm",
+"fr": "fr_core_news_sm",
+"it": "it_core_news_sm",
+"nl": "nl_core_news_sm",
+"el": "el_core_news_sm",
+"nb": "nb_core_news_sm",
+"lt": "lt_core_news_sm",
+"xx": "xx_ent_wiki_sm",
+}
@app.command(
"download",
@@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
version = components[-1]
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
else:
-shortcuts = get_json(about.__shortcuts__, "available shortcuts")
-model_name = shortcuts.get(model, model)
+model_name = model
+if model in OLD_SHORTCUTS:
+msg.warn(
+f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
+f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
+)
+model_name = OLD_SHORTCUTS[model]
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
@@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
)
-def get_json(url: str, desc: str) -> Union[dict, list]:
-r = requests.get(url)
+def get_compatibility() -> dict:
+version = get_base_version(about.__version__)
+r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
f"Server error ({r.status_code})",
-f"Couldn't fetch {desc}. Please find a model for your spaCy "
+f"Couldn't fetch compatibility table. Please find a model for your spaCy "
f"installation (v{about.__version__}), and download it manually. "
f"For more details, see the documentation: "
f"https://spacy.io/usage/models",
exits=1,
)
-return r.json()
-def get_compatibility() -> dict:
-version = get_base_version(about.__version__)
-comp_table = get_json(about.__compatibility__, "compatibility table")
+comp_table = r.json()
comp = comp_table["spacy"]
if version not in comp:
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)

spacy/cli/project.py (deleted, 708 lines)

@@ -1,708 +0,0 @@
from typing import List, Dict, Any, Optional, Sequence
import typer
import srsly
from pathlib import Path
from wasabi import msg
import subprocess
import os
import re
import shutil
import sys
import requests
import tqdm
from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about
from ..schemas import ProjectConfigSchema, validate
from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum, split_command
CONFIG_FILE = "project.yml"
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
DIRS = [
"assets",
"metas",
"configs",
"packages",
"metrics",
"scripts",
"notebooks",
"training",
"corpus",
]
CACHES = [
Path.home() / ".torch",
Path.home() / ".caches" / "torch",
os.environ.get("TORCH_HOME"),
Path.home() / ".keras",
]
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
# it directly; edit the project.yml instead and re-run the project."""
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
templates. You'd typically start by cloning a project template to a local
directory and fetching its assets like datasets etc. See the project's
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
Version Control) to manage input and output files and to ensure steps are only
re-run if their inputs change.
"""
project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
@project_cli.callback(invoke_without_command=True)
def callback(ctx: typer.Context):
"""This runs before every project command and ensures DVC is installed."""
ensure_dvc()
################
# CLI COMMANDS #
################
@project_cli.command("clone")
def project_clone_cli(
# fmt: off
name: str = Arg(..., help="The name of the template to fetch"),
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
(including using a private repo). Setting the --git flag will also
initialize the project directory as a Git repo. If the project is intended
to be a Git repo, it should be initialized with Git first, before
initializing DVC (Data Version Control). This allows DVC to integrate with
Git.
"""
if dest == Path.cwd():
dest = dest / name
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
@project_cli.command("init")
def project_init_cli(
# fmt: off
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
force: bool = Opt(False, "--force", "-F", help="Force initialization"),
# fmt: on
):
"""Initialize a project directory with DVC and optionally Git. This should
typically be taken care of automatically when you run the "project clone"
command, but you can also run it separately. If the project is intended to
be a Git repo, it should be initialized with Git first, before initializing
DVC. This allows DVC to integrate with Git.
"""
project_init(path, git=git, force=force, silent=True)
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Use DVC (Data Version Control) to fetch project assets. Assets are
defined in the "assets" section of the project config. If possible, DVC
will try to track the files so you can pull changes from upstream. It will
also try and store the checksum so the assets are versioned. If the file
can't be tracked or checked, it will be downloaded without DVC. If a checksum
is provided in the project config, the file is only downloaded if no local
file with the same checksum exists.
"""
project_assets(project_dir)
@project_cli.command(
"run-all",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_all_cli(
# fmt: off
ctx: typer.Context,
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run all commands defined in the project. This command will use DVC and
the defined outputs and dependencies in the project config to determine
which steps need to be re-run and where to start. This means you're only
re-generating data if the inputs have changed.
This command calls into "dvc repro" and all additional arguments are passed
to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
"""
if show_help:
print_run_help(project_dir)
else:
project_run_all(project_dir, *ctx.args)
@project_cli.command(
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
# fmt: off
ctx: typer.Context,
subcommand: str = Arg(None, help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run a named script defined in the project config. If the command is
part of the default pipeline defined in the "run" section, DVC is used to
determine whether the step should re-run if its inputs have changed, or
whether everything is up to date. If the script is not part of the default
pipeline, it will be called separately without DVC.
If DVC is used, the command calls into "dvc repro" and all additional
arguments are passed to the "dvc repro" command:
https://dvc.org/doc/command-reference/repro
"""
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
else:
project_run(project_dir, subcommand, *ctx.args)
@project_cli.command("exec", hidden=True)
def project_exec_cli(
# fmt: off
subcommand: str = Arg(..., help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Execute a command defined in the project config. This CLI command is
only called internally in auto-generated DVC pipelines, as a shortcut for
multi-step commands in the project config. You typically shouldn't have to
call it yourself. To run a command, call "run" or "run-all".
"""
project_exec(project_dir, subcommand)
@project_cli.command("update-dvc")
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
"""Update the auto-generated DVC config file. Uses the steps defined in the
"run" section of the project config. This typically happens automatically
when running a command, but can also be triggered manually if needed.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
if updated:
msg.good(f"Updated DVC config from {CONFIG_FILE}")
else:
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
app.add_typer(project_cli, name="project")
#################
# CLI FUNCTIONS #
#################
def project_clone(
name: str,
dest: Path,
*,
repo: str = about.__projects__,
git: bool = False,
no_init: bool = False,
) -> None:
"""Clone a project template from a repository.
name (str): Name of subdirectory to clone.
dest (Path): Destination path of cloned project.
repo (str): URL of Git repo containing project templates.
git (bool): Initialize project as Git repo. Should be set to True if project
is intended as a repo, since it will allow DVC to integrate with Git.
no_init (bool): Don't initialize DVC and Git automatically. If True, the
"init" command or "git init" and "dvc init" need to be run manually.
"""
dest = ensure_path(dest)
check_clone(name, dest, repo)
project_dir = dest.resolve()
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try:
run_command(cmd)
except SystemExit:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
try:
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
except SystemExit:
err = f"Could not clone '{name}' in the repo '{repo}'."
msg.fail(err)
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
for sub_dir in DIRS:
dir_path = project_dir / sub_dir
if not dir_path.exists():
dir_path.mkdir(parents=True)
if not no_init:
project_init(project_dir, git=git, force=True, silent=True)
msg.good(f"Your project is now ready!", dest)
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def project_init(
project_dir: Path,
*,
git: bool = False,
force: bool = False,
silent: bool = False,
analytics: bool = False,
):
"""Initialize a project as a DVC and (optionally) as a Git repo.
project_dir (Path): Path to project directory.
git (bool): Also call "git init" to initialize directory as a Git repo.
silent (bool): Don't print any output (via DVC).
analytics (bool): Opt-in to DVC analytics (defaults to False).
"""
with working_dir(project_dir) as cwd:
if git:
run_command(["git", "init"])
init_cmd = ["dvc", "init"]
if silent:
init_cmd.append("--quiet")
if not git:
init_cmd.append("--no-scm")
if force:
init_cmd.append("--force")
run_command(init_cmd)
# We don't want to have analytics on by default; our users should
# opt in explicitly. If they want it, they can always enable it.
if not analytics:
run_command(["dvc", "config", "core.analytics", "false"])
# Remove unused and confusing plot templates from .dvc directory
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
# once you commit your changes via Git and it creates a bunch of files
# that have no purpose
plots_dir = cwd / DVC_DIR / "plots"
if plots_dir.exists():
shutil.rmtree(str(plots_dir))
config = load_project_config(cwd)
setup_check_dvc(cwd, config)
def project_assets(project_dir: Path) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path)
setup_check_dvc(project_path, config)
assets = config.get("assets", {})
if not assets:
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {})
fetched_assets = []
for asset in assets:
url = asset["url"].format(**variables)
dest = asset["dest"].format(**variables)
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
if fetched_path:
fetched_assets.append(str(fetched_path))
if fetched_assets:
with working_dir(project_path):
run_command(["dvc", "add", *fetched_assets, "--external"])
def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
"""Fetch an asset from a given URL or path. Will try to import the file
using DVC's import-url if possible (fully tracked and versioned) and falls
back to get-url (versioned) and a non-DVC download if necessary. If a
checksum is provided and a local file exists, it's only re-downloaded if the
checksum doesn't match.
project_path (Path): Path to project directory.
url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
"""
url = convert_asset_url(url)
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
# TODO: add support for caches (dvc import-url with local path)
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return dest_path
with working_dir(project_path):
try:
# If these fail, we don't want to output an error or info message.
# Try with tracking the source first, then just downloading with
# DVC, then a regular non-DVC download.
try:
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
try:
download_file(url, dest_path)
except requests.exceptions.HTTPError as e:
msg.fail(f"Download failed: {dest}", e)
return None
if checksum and checksum != get_checksum(dest_path):
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
msg.good(f"Fetched asset {dest}")
return dest_path
def project_run_all(project_dir: Path, *dvc_args) -> None:
"""Run all commands defined in the project using DVC.
project_dir (Path): Path to project directory.
*dvc_args: Other arguments passed to "dvc repro".
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
dvc_cmd = ["dvc", "repro", *dvc_args]
with working_dir(project_dir):
run_command(dvc_cmd)
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
"""Simulate a CLI help prompt using the info available in the project config.
project_dir (Path): The project directory.
subcommand (Optional[str]): The subcommand or None. If a subcommand is
provided, the subcommand help is shown. Otherwise, the top-level help
and a list of available commands is printed.
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config_commands = config.get("commands", [])
commands = {cmd["name"]: cmd for cmd in config_commands}
if subcommand:
validate_subcommand(commands.keys(), subcommand)
print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
help_text = commands[subcommand].get("help")
if help_text:
msg.text(f"\n{help_text}\n")
else:
print(f"\nAvailable commands in {CONFIG_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
msg.text("Run all commands defined in the 'run' block of the project config:")
print(f"{COMMAND} project run-all {project_dir}")
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
"""Run a named script defined in the project config. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
execute the command, so it can determine whether to rerun it. It then
calls into "exec" to execute it.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
*dvc_args: Other arguments passed to "dvc repro".
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config_commands = config.get("commands", [])
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config_commands}
validate_subcommand(commands.keys(), subcommand)
if subcommand in config.get("run", []):
# This is one of the pipeline commands tracked in DVC
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
with working_dir(project_dir):
run_command(dvc_cmd)
else:
cmd = commands[subcommand]
# Deps in non-DVC commands aren't tracked, but if they're defined,
# make sure they exist before running the command
for dep in cmd.get("deps", []):
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
msg.fail(err, exits=1)
with working_dir(project_dir):
run_commands(cmd["script"], variables)
def project_exec(project_dir: Path, subcommand: str):
"""Execute a command defined in the project config.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
"""
config = load_project_config(project_dir)
config_commands = config.get("commands", [])
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config_commands}
with working_dir(project_dir):
run_commands(commands[subcommand]["script"], variables)
###########
# HELPERS #
###########
def load_project_config(path: Path) -> Dict[str, Any]:
"""Load the project config file from a directory and validate it.
path (Path): The path to the project directory.
RETURNS (Dict[str, Any]): The loaded project config.
"""
config_path = path / CONFIG_FILE
if not config_path.exists():
msg.fail("Can't find project config", config_path, exits=1)
invalid_err = f"Invalid project config in {CONFIG_FILE}"
try:
config = srsly.read_yaml(config_path)
except ValueError as e:
msg.fail(invalid_err, e, exits=1)
errors = validate(ProjectConfigSchema, config)
if errors:
msg.fail(invalid_err, "\n".join(errors), exits=1)
return config
def update_dvc_config(
path: Path,
config: Dict[str, Any],
verbose: bool = False,
silent: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
config_hash = get_hash(config)
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
return False # Nothing has changed in project config, don't need to update
dvc_config_path.unlink()
variables = config.get("variables", {})
commands = []
# We only want to include commands that are part of the main list of "run"
# commands in project.yml and should be run in sequence
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in config.get("run", []):
validate_subcommand(config_commands.keys(), name)
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
outputs_no_cache = command.get("outputs_no_cache", [])
if not deps and not outputs and not outputs_no_cache:
continue
# Default to the working dir as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "exec", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
if verbose:
dvc_cmd.append("--verbose")
if silent:
dvc_cmd.append("--quiet")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
commands.append(" ".join(full_cmd))
with working_dir(path):
run_commands(commands, variables, silent=True)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
return True
def ensure_dvc() -> None:
"""Ensure that the "dvc" command is available and show an error if not."""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
"""Check that the project is set up correctly with DVC and update its
config if needed. Will raise an error if the project is not an initialized
DVC project.
project_dir (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config.
"""
if not project_dir.exists():
msg.fail(f"Can't find project directory: {project_dir}")
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project.",
f"Make sure that the project template was cloned correctly. To "
f"initialize the project directory manually, you can run: "
f"{COMMAND} project init {project_dir}",
exits=1,
)
with msg.loading("Updating DVC config..."):
updated = update_dvc_config(project_dir, config, silent=True)
if updated:
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
def run_commands(
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The string commands.
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]]
if not silent:
print(f"Running command: {' '.join(command)}")
run_command(command)
def convert_asset_url(url: str) -> str:
"""Check and convert the asset URL if needed.
url (str): The asset URL.
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match("(http(s?)):\/\/github.com", url):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
"Downloading from a regular GitHub URL. This will only download "
"the source of the page, not the actual file. Converting the URL "
"to a raw URL.",
converted,
)
return converted
return url
def check_clone(name: str, dest: Path, repo: str) -> None:
"""Check and validate that the destination path can be used to clone. Will
check that Git is available and that the destination path is suitable.
name (str): Name of the directory to clone from the repo.
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
# Directory already exists (not allowed, clone needs to create it)
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
if not dest.parent.exists():
# We're not creating parents, parent dir should exist
msg.fail(
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
exits=1,
)
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
"""Check that a subcommand is valid and defined. Raises an error otherwise.
commands (Sequence[str]): The available commands.
subcommand (str): The subcommand.
"""
if subcommand not in commands:
msg.fail(
f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
f"Available commands: {', '.join(commands)}",
exits=1,
)
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
"""Download a file using requests.
url (str): The URL of the file.
dest (Path): The destination path.
chunk_size (int): The size of chunks to read/write.
"""
response = requests.get(url, stream=True)
response.raise_for_status()
total = int(response.headers.get("content-length", 0))
progress_settings = {
"total": total,
"unit": "iB",
"unit_scale": True,
"unit_divisor": chunk_size,
"leave": False,
}
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
for data in response.iter_content(chunk_size=chunk_size):
size = f.write(data)
bar.update(size)

spacy/cli/project/assets.py (new file, 158 lines)

@@ -0,0 +1,158 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import requests
import tqdm
import re
import shutil
from ...util import ensure_path, working_dir
from .._app import project_cli, Arg
from .util import PROJECT_FILE, load_project_config, get_checksum
# TODO: find a solution for caches
# CACHES = [
# Path.home() / ".torch",
# Path.home() / ".caches" / "torch",
# os.environ.get("TORCH_HOME"),
# Path.home() / ".keras",
# ]
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Fetch project assets like datasets and pretrained weights. Assets are
defined in the "assets" section of the project.yml. If a checksum is
provided in the project.yml, the file is only downloaded if no local file
with the same checksum exists.
"""
project_assets(project_dir)
def project_assets(project_dir: Path) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path)
assets = config.get("assets", {})
if not assets:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {})
for asset in assets:
dest = asset["dest"].format(**variables)
url = asset.get("url")
checksum = asset.get("checksum")
if not url:
# project.yml defines asset without URL that the user has to place
check_private_asset(dest, checksum)
continue
url = url.format(**variables)
fetch_asset(project_path, url, dest, checksum)
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
"""Check and validate assets without a URL (private assets that the user
has to provide themselves) and give feedback about the checksum.
dest (Path): Destination path of the asset.
checksum (Optional[str]): Optional checksum of the expected file.
"""
if not Path(dest).exists():
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
msg.warn(err)
else:
if checksum and checksum == get_checksum(dest):
msg.good(f"Asset exists with matching checksum: {dest}")
else:
msg.fail(f"Asset available but with incorrect checksum: {dest}")
def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
"""Fetch an asset from a given URL or path. If a checksum is provided and a
local file exists, it's only re-downloaded if the checksum doesn't match.
project_path (Path): Path to project directory.
url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
"""
# TODO: add support for caches
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return dest_path
# We might as well support the user here and create parent directories in
# case the asset dir isn't listed as a dir to create in the project.yml
if not dest_path.parent.exists():
dest_path.parent.mkdir(parents=True)
with working_dir(project_path):
url = convert_asset_url(url)
try:
download_file(url, dest_path)
msg.good(f"Downloaded asset {dest}")
except requests.exceptions.RequestException as e:
if Path(url).exists() and Path(url).is_file():
# If it's a local file, copy to destination
shutil.copy(url, str(dest_path))
msg.good(f"Copied local asset {dest}")
else:
msg.fail(f"Download failed: {dest}", e)
return
if checksum and checksum != get_checksum(dest_path):
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
def convert_asset_url(url: str) -> str:
"""Check and convert the asset URL if needed.
url (str): The asset URL.
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match(r"(http(s?)):\/\/github.com", url):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
"Downloading from a regular GitHub URL. This will only download "
"the source of the page, not the actual file. Converting the URL "
"to a raw URL.",
converted,
)
return converted
return url
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
"""Download a file using requests.
url (str): The URL of the file.
dest (Path): The destination path.
chunk_size (int): The size of chunks to read/write.
"""
response = requests.get(url, stream=True)
response.raise_for_status()
total = int(response.headers.get("content-length", 0))
progress_settings = {
"total": total,
"unit": "iB",
"unit_scale": True,
"unit_divisor": chunk_size,
"leave": False,
}
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
for data in response.iter_content(chunk_size=chunk_size):
size = f.write(data)
bar.update(size)
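get_checksum is imported from the sibling util module, which is not part of this diff. A plausible sketch, under the assumption that it is a stable digest of the file contents:

import hashlib
from pathlib import Path

def get_checksum(path: Path) -> str:
    # Assumption: a simple MD5 over the raw bytes of a single file
    return hashlib.md5(Path(path).read_bytes()).hexdigest()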

spacy/cli/project/clone.py (new file, 97 lines)

@@ -0,0 +1,97 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
import shutil
import re
from ... import about
from ...util import ensure_path, run_command, make_tempdir
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE
@project_cli.command("clone")
def project_clone_cli(
# fmt: off
name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
(including using a private repo).
"""
if dest is None:
dest = Path.cwd() / name
project_clone(name, dest, repo=repo)
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
"""Clone a project template from a repository.
name (str): Name of subdirectory to clone.
dest (Path): Destination path of cloned project.
repo (str): URL of Git repo containing project templates.
"""
dest = ensure_path(dest)
check_clone(name, dest, repo)
project_dir = dest.resolve()
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try:
run_command(cmd)
except subprocess.CalledProcessError:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
try:
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
msg.fail(err)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(name)), str(project_dir))
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory")
else:
msg.good(f"Your project is now ready!")
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def check_clone(name: str, dest: Path, repo: str) -> None:
"""Check and validate that the destination path can be used to clone. Will
check that Git is available and that the destination path is suitable.
name (str): Name of the directory to clone from the repo.
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
# Directory already exists (not allowed, clone needs to create it)
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
if not dest.parent.exists():
# We're not creating parents, parent dir should exist
msg.fail(
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
exits=1,
)
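A usage example for project_clone as defined above; the template name and destination are hypothetical:

from pathlib import Path

# Clones the "some_example" template from the default repo into ./my_project
# (runs git under the hood, so this needs network access and git installed).
project_clone("some_example", Path("my_project"))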

spacy/cli/project/dvc.py (new file, 208 lines)

@@ -0,0 +1,208 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional
import subprocess
from pathlib import Path
from wasabi import msg
from .util import PROJECT_FILE, load_project_config, get_hash
from .._app import project_cli, Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
"""Auto-generate Data Version Control (DVC) config. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. If no workflow is specified, the first defined
workflow is used. The DVC config will only be updated if the project.yml changed.
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
def project_update_dvc(
project_dir: Path,
workflow: Optional[str] = None,
*,
verbose: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. Will only update the file if the checksum changed.
project_dir (Path): The project directory.
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
else:
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
def update_dvc_config(
path: Path,
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
ensure_dvc(path)
workflows = config.get("workflows", {})
workflow_names = list(workflows.keys())
check_workflows(workflow_names, workflow)
if not workflow:
workflow = workflow_names[0]
config_hash = get_hash(config)
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
return False # Nothing has changed in project.yml, don't need to update
dvc_config_path.unlink()
variables = config.get("variables", {})
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
outputs_no_cache = command.get("outputs_no_cache", [])
if not deps and not outputs and not outputs_no_cache:
continue
# Default to the working dir as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "run", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
return True
def run_dvc_commands(
commands: List[str] = tuple(),
variables: Dict[str, str] = {},
flags: Dict[str, bool] = {},
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
workflows (List[str]): Names of the available workflows.
workflow (Optional[str]): The name of the workflow to convert.
"""
if not workflows:
msg.fail(
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
f"define at least one list of commands.",
exits=1,
)
if workflow is not None and workflow not in workflows:
msg.fail(
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
f"Available workflows: {', '.join(workflows)}",
exits=1,
)
if not workflow:
msg.warn(
f"No workflow specified for DVC pipeline. Using the first workflow "
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
)
def ensure_dvc(project_dir: Path) -> None:
"""Ensure that the "dvc" command is available and that the current project
directory is an initialized DVC project.
"""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"To use spaCy projects with DVC (Data Version Control), DVC needs "
"to be installed and the 'dvc' command needs to be available",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project",
"To initialize a DVC project, you can run 'dvc init' in the project "
"directory. For more details, see the documentation: "
"https://dvc.org/doc/command-reference/init",
exits=1,
)

spacy/cli/project/run.py (new file, 266 lines)
@@ -0,0 +1,266 @@
from typing import Optional, List, Dict, Sequence, Any
from pathlib import Path
from wasabi import msg
import sys
import srsly
from ...util import working_dir, run_command, split_command, is_cwd, join_command
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .util import get_checksum
@project_cli.command("run")
def project_run_cli(
# fmt: off
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run a named command or workflow defined in the project.yml. If a workflow
name is specified, all commands in the workflow are run, in order. If
commands define dependencies and/or outputs, they will only be re-run if
state has changed.
"""
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
else:
project_run(project_dir, subcommand, force=force, dry=dry)
def project_run(
project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
) -> None:
"""Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
execute the command, so it can determine whether to rerun it. It then
calls into "exec" to execute it.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
force (bool): Force re-running, even if nothing changed.
dry (bool): Perform a dry run and don't execute commands.
"""
config = load_project_config(project_dir)
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
for cmd in workflows[subcommand]:
project_run(project_dir, cmd, force=force, dry=dry)
else:
cmd = commands[subcommand]
variables = config.get("variables", {})
for dep in cmd.get("deps", []):
dep = dep.format(**variables)
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, **err_kwargs)
with working_dir(project_dir) as current_dir:
rerun = check_rerun(current_dir, cmd, variables)
if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed")
else:
msg.divider(subcommand)
run_commands(cmd["script"], variables, dry=dry)
if not dry:
update_lockfile(current_dir, cmd, variables)
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
"""Simulate a CLI help prompt using the info available in the project.yml.
project_dir (Path): The project directory.
subcommand (Optional[str]): The subcommand or None. If a subcommand is
provided, the subcommand help is shown. Otherwise, the top-level help
and a list of available commands is printed.
"""
config = load_project_config(project_dir)
config_commands = config.get("commands", [])
commands = {cmd["name"]: cmd for cmd in config_commands}
workflows = config.get("workflows", {})
project_loc = "" if is_cwd(project_dir) else project_dir
if subcommand:
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
if subcommand in commands:
help_text = commands[subcommand].get("help")
if help_text:
print(f"\n{help_text}\n")
elif subcommand in workflows:
steps = workflows[subcommand]
print(f"\nWorkflow consisting of {len(steps)} commands:")
steps_data = [
(f"{i + 1}. {step}", commands[step].get("help", ""))
for i, step in enumerate(steps)
]
msg.table(steps_data)
help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
print(f"For command details, run: {help_cmd}")
else:
print("")
if config_commands:
print(f"Available commands in {PROJECT_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
if workflows:
print(f"Available workflows in {PROJECT_FILE}")
print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
def run_commands(
commands: List[str] = tuple(),
variables: Dict[str, Any] = {},
silent: bool = False,
dry: bool = False,
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The string commands.
variables (Dict[str, Any]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
dry (bool): Perform a dry run and don't execute anything.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]]
if not silent:
print(f"Running command: {join_command(command)}")
if not dry:
run_command(command)
def validate_subcommand(
commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
"""Check that a subcommand is valid and defined. Raises an error otherwise.
commands (Sequence[str]): The available commands.
subcommand (str): The subcommand.
"""
if not commands and not workflows:
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
if subcommand not in commands and subcommand not in workflows:
help_msg = []
if commands:
help_msg.append(f"Available commands: {', '.join(commands)}")
if workflows:
help_msg.append(f"Available workflows: {', '.join(workflows)}")
msg.fail(
f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
". ".join(help_msg),
exits=1,
)
def check_rerun(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs
changed.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (bool): Whether to re-run the command.
"""
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists(): # We don't have a lockfile, run command
return True
data = srsly.read_yaml(lock_path)
if command["name"] not in data: # We don't have info about this command
return True
entry = data[command["name"]]
# Always run commands with no outputs (otherwise they'd always be skipped)
if not entry.get("outs", []):
return True
# If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed
return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
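# Sketch of why the hash comparison above detects change (hypothetical
# entries): any difference in the script lines, dependency checksums or
# output paths yields a different MD5 over the serialized entry, so
# check_rerun() returns True and the command is executed again.
#
#     entry_a = {"cmd": "spacy run train", "script": ["python train.py"], "deps": [], "outs": []}
#     entry_b = {**entry_a, "script": ["python train.py --gpu"]}
#     assert get_hash(entry_a) != get_hash(entry_b)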
def update_lockfile(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
"""Update the lockfile after running a command. Will create a lockfile if
it doesn't yet exist and will add an entry for the current command, its
script and dependencies/outputs.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
"""
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists():
srsly.write_yaml(lock_path, {})
data = {}
else:
data = srsly.read_yaml(lock_path)
data[command["name"]] = get_lock_entry(project_dir, command, variables)
srsly.write_yaml(lock_path, data)
def get_lock_entry(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
"""Get a lockfile entry for a given command. An entry includes the command,
the script (command steps) and a list of dependencies and outputs with
their paths and file hashes, if available. The format is based on the
dvc.lock files, to keep things consistent.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (Dict[str, Any]): The lockfile entry.
"""
deps = get_fileinfo(project_dir, command.get("deps", []), variables)
outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
return {
"cmd": f"{COMMAND} run {command['name']}",
"script": command["script"],
"deps": deps,
"outs": [*outs, *outs_nc],
}
def get_fileinfo(
project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
"""Generate the file information for a list of paths (dependencies, outputs).
Includes the file path and the file's checksum.
project_dir (Path): The current project directory.
paths (List[str]): The file paths.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
"""
data = []
for path in paths:
path = path.format(**variables)
file_path = project_dir / path
md5 = get_checksum(file_path) if file_path.exists() else None
data.append({"path": path, "md5": md5})
return data

spacy/cli/project/util.py (new file, 93 lines)
@@ -0,0 +1,93 @@
from typing import Dict, Any, Union
from pathlib import Path
from wasabi import msg
import srsly
import hashlib
from ...schemas import ProjectConfigSchema, validate
PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"
def load_project_config(path: Path) -> Dict[str, Any]:
"""Load the project.yml file from a directory and validate it. Also make
sure that all directories defined in the config exist.
path (Path): The path to the project directory.
RETURNS (Dict[str, Any]): The loaded project.yml.
"""
config_path = path / PROJECT_FILE
if not config_path.exists():
msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
try:
config = srsly.read_yaml(config_path)
except ValueError as e:
msg.fail(invalid_err, e, exits=1)
errors = validate(ProjectConfigSchema, config)
if errors:
msg.fail(invalid_err, "\n".join(errors), exits=1)
validate_project_commands(config)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
dir_path = path / subdir
if not dir_path.exists():
dir_path.mkdir(parents=True)
return config
def validate_project_commands(config: Dict[str, Any]) -> None:
"""Check that project commands and workflows are valid, don't contain
duplicates, don't clash and only refer to commands that exist.
config (Dict[str, Any]): The loaded config.
"""
command_names = [cmd["name"] for cmd in config.get("commands", [])]
workflows = config.get("workflows", {})
duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
if duplicates:
err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
msg.fail(err, exits=1)
for workflow_name, workflow_steps in workflows.items():
if workflow_name in command_names:
err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
msg.fail(err, exits=1)
for step in workflow_steps:
if step not in command_names:
msg.fail(
f"Unknown command specified in workflow '{workflow_name}': {step}",
f"Workflows can only refer to commands defined in the 'commands' "
f"section of the {PROJECT_FILE}.",
exits=1,
)
def get_hash(data) -> str:
"""Get the hash for a JSON-serializable object.
data: The data to hash.
RETURNS (str): The hash.
"""
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
return hashlib.md5(data_str).hexdigest()
def get_checksum(path: Union[Path, str]) -> str:
"""Get the checksum for a file or directory given its file path. If a
directory path is provided, this uses all files in that directory.
path (Union[Path, str]): The file or directory path.
RETURNS (str): The checksum.
"""
path = Path(path)
if path.is_file():
return hashlib.md5(Path(path).read_bytes()).hexdigest()
if path.is_dir():
# TODO: this is currently pretty slow
dir_checksum = hashlib.md5()
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
dir_checksum.update(sub_file.read_bytes())
return dir_checksum.hexdigest()
raise ValueError(f"Can't get checksum for {path}: not a file or directory")

spacy/cli/train.py

@@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
 @app.command("train")
 def train_cli(
     # fmt: off
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
+    train_path: Path = Arg(..., help="Location of training data", exists=True),
+    dev_path: Path = Arg(..., help="Location of development data", exists=True),
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
     raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
-    verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
     num_workers: int = Opt(None, "-j", help="Parallel Workers"),
     strategy: str = Opt("allreduce", "--strategy", help="Distributed training strategy (requires spacy_ray)"),
@@ -155,6 +155,7 @@ def train_cli(
     if init_tok2vec is not None:
         with init_tok2vec.open("rb") as file_:
             weights_data = file_.read()
     train_args = dict(
         config_path=config_path,
         data_paths={"train": train_path, "dev": dev_path},
@@ -170,7 +171,7 @@ def train_cli(
         distributed_setup_and_train(use_gpu, num_workers, strategy, ray_address, train_args)
     else:
         if use_gpu >= 0:
-            msg.info(f"Using GPU: {str(use_gpu)}")
+            msg.info(f"Using GPU: {use_gpu}")
             require_gpu(use_gpu)
         else:
             msg.info("Using CPU")
@@ -191,7 +192,8 @@ def train(
     msg.info(f"Loading config from: {config_path}")
     # Read the config first without creating objects, to get to the original nlp_config
    config = util.load_config(config_path, create_objects=False)
-    fix_random_seed(config["training"]["seed"])
+    if config["training"].get("seed"):
+        fix_random_seed(config["training"]["seed"])
     if config["training"].get("use_pytorch_for_gpu_memory"):
         # It feels kind of weird to not have a default for this.
         use_pytorch_for_gpu_memory()
@@ -216,7 +218,10 @@ def train(
     msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
     train_examples = list(
         corpus.train_dataset(
-            nlp, shuffle=False, gold_preproc=training["gold_preproc"]
+            nlp,
+            shuffle=False,
+            gold_preproc=training["gold_preproc"],
+            max_length=training["max_length"],
         )
     )
     nlp.begin_training(lambda: train_examples)
@@ -315,6 +320,7 @@ def create_train_batches(nlp, corpus, cfg, randomization_index):
     )
     epoch = 0
+    batch_strategy = cfg.get("batch_by", "sequences")
     while True:
         if len(train_examples) == 0:
             raise ValueError(Errors.E988)
@@ -324,11 +330,22 @@ def create_train_batches(nlp, corpus, cfg, randomization_index):
             random.random()
             random.shuffle(train_examples)
             epoch += 1
-        batches = util.minibatch_by_words(
-            train_examples,
-            size=cfg["batch_size"],
-            discard_oversize=cfg["discard_oversize"],
-        )
+        if batch_strategy == "padded":
+            batches = util.minibatch_by_padded_size(
+                train_examples,
+                size=cfg["batch_size"],
+                buffer=256,
+                discard_oversize=cfg["discard_oversize"],
+            )
+        elif batch_strategy == "words":
+            batches = util.minibatch_by_words(
+                train_examples,
+                size=cfg["batch_size"],
+                discard_oversize=cfg["discard_oversize"],
+            )
+        else:
+            batches = util.minibatch(train_examples, size=cfg["batch_size"])
         # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
         try:
             first = next(batches)
@@ -440,7 +457,9 @@ def train_while_improving(
     if raw_text:
         random.shuffle(raw_text)
-        raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text]
+        raw_examples = [
+            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
+        ]
         raw_batches = util.minibatch(raw_examples, size=8)
     for step, (epoch, batch) in enumerate(train_data):
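As a quick illustration of the three batching strategies the new create_train_batches dispatches on (a sketch using the util helpers this diff references; the sizes are made up, not from the commit):

    from spacy import util
    batches = util.minibatch_by_padded_size(train_examples, size=2000, buffer=256, discard_oversize=False)  # batch_by = "padded"
    batches = util.minibatch_by_words(train_examples, size=2000, discard_oversize=False)                    # batch_by = "words"
    batches = util.minibatch(train_examples, size=64)                                                       # any other value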

spacy/errors.py

@@ -69,6 +69,9 @@ class Warnings(object):
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
+    W028 = ("Doc.from_array was called with a vector of type '{type}', "
+            "but is expecting one of type 'uint64' instead. This may result "
+            "in problems with the vocab further on in the pipeline.")
     W030 = ("Some entities could not be aligned in the text \"{text}\" with "
             "entities \"{entities}\". Use "
             "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
@@ -477,15 +480,14 @@ class Errors(object):
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
             "array and {doc_length} for the Doc itself.")
     E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
     E973 = ("Unexpected type for NER data")
     E974 = ("Unknown {obj} attribute: {key}")
-    E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
-            "but got {type}")
-    E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
+    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")

spacy/gold/example.pyx

@@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
-        """ Doc can either be text, or an actual Doc """
         if predicted is None:
             raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
@@ -37,6 +36,9 @@ cdef class Example:
         self.y = reference
         self._alignment = alignment

+    def __len__(self):
+        return len(self.predicted)
+
     property predicted:
         def __get__(self):
             return self.x
@@ -59,17 +61,15 @@ cdef class Example:
     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
+        if predicted is None:
+            raise ValueError(Errors.E976.format(n="first", type="Doc"))
         if example_dict is None:
-            raise ValueError(Errors.E976)
-        if not isinstance(predicted, Doc):
-            raise TypeError(Errors.E975.format(type=type(predicted)))
+            raise ValueError(Errors.E976.format(n="second", type="dict"))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
             tok_dict["ORTH"] = [tok.text for tok in predicted]
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
+        if not _has_field(tok_dict, "SPACY"):
+            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
         return Example(
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)
@@ -257,7 +257,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([vocab.morphology.add(v) for v in value])
         else:
             attrs.append(key)
-            values.append([vocab.strings.add(v) for v in value])
+            try:
+                values.append([vocab.strings.add(v) for v in value])
+            except TypeError:
+                types = set([type(v) for v in value])
+                raise TypeError(Errors.E969.format(field=key, types=types))
     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
@@ -325,8 +329,8 @@ def _fix_legacy_dict_data(example_dict):
     for key, value in old_token_dict.items():
         if key in ("text", "ids", "brackets"):
             pass
-        elif key in remapping:
-            token_dict[remapping[key]] = value
+        elif key.lower() in remapping:
+            token_dict[remapping[key.lower()]] = value
         else:
             raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
     text = example_dict.get("text", example_dict.get("raw"))
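For reference, a minimal sketch of the validated entry point after this change (hypothetical text, assuming a loaded `nlp`):

    doc = nlp.make_doc("I like eggs")
    eg = Example.from_dict(doc, {"words": ["I", "like", "eggs"]})
    # Example.from_dict(None, {})  now raises ValueError(E976, n="first")
    # Example.from_dict(doc, None) raises ValueError(E976, n="second")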

spacy/language.py

@@ -513,20 +513,23 @@ class Language(object):
     ):
         """Update the models in the pipeline.

-        examples (iterable): A batch of `Example` objects.
+        examples (Iterable[Example]): A batch of examples
         dummy: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (callable): An optimizer.
-        losses (dict): Dictionary to update with the loss, keyed by component.
-        component_cfg (dict): Config parameters for specific pipeline
+        sgd (Optimizer): An optimizer.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
             components, keyed by component name.
+        RETURNS (Dict[str, float]): The updated losses dictionary

         DOCS: https://spacy.io/api/language#update
         """
         if dummy is not None:
             raise ValueError(Errors.E989)
+        if losses is None:
+            losses = {}
         if len(examples) == 0:
-            return
+            return losses
         if not isinstance(examples, Iterable):
             raise TypeError(Errors.E978.format(name="language", method="update", types=type(examples)))
         wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
@@ -540,22 +543,19 @@ class Language(object):
         if component_cfg is None:
             component_cfg = {}
-        component_deps = count_pipeline_interdependencies(self.pipeline)
-        # Determine whether component should set annotations. In theory I guess
-        # we should do this by inspecting the meta? Or we could just always
-        # say "yes"
         for i, (name, proc) in enumerate(self.pipeline):
             component_cfg.setdefault(name, {})
             component_cfg[name].setdefault("drop", drop)
-            component_cfg[name]["set_annotations"] = bool(component_deps[i])
+            component_cfg[name].setdefault("set_annotations", False)
         for name, proc in self.pipeline:
             if not hasattr(proc, "update"):
                 continue
             proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
-        if sgd is not False:
+        if sgd not in (None, False):
             for name, proc in self.pipeline:
                 if hasattr(proc, "model"):
                     proc.model.finish_update(sgd)
+        return losses

     def rehearse(self, examples, sgd=None, losses=None, config=None):
         """Make a "rehearsal" update to the models in the pipeline, to prevent
@@ -761,18 +761,17 @@ class Language(object):
     ):
         """Process texts as a stream, and yield `Doc` objects in order.

-        texts (iterator): A sequence of texts to process.
+        texts (Iterable[str]): A sequence of texts to process.
         as_tuples (bool): If set to True, inputs should be a sequence of
             (text, context) tuples. Output will then be a sequence of
             (doc, context) tuples. Defaults to False.
         batch_size (int): The number of texts to buffer.
-        disable (list): Names of the pipeline components to disable.
+        disable (List[str]): Names of the pipeline components to disable.
         cleanup (bool): If True, unneeded strings are freed to control memory
             use. Experimental.
-        component_cfg (dict): An optional dictionary with extra keyword
+        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
             arguments for specific components.
-        n_process (int): Number of processors to process texts, only supported
-            in Python3. If -1, set `multiprocessing.cpu_count()`.
+        n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.

         YIELDS (Doc): Documents in the order of the original text.

         DOCS: https://spacy.io/api/language#pipe
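A hedged usage sketch of the new contract (assumes a pipeline, examples and an optimizer are already set up): update() now always returns the losses dict, even when none was passed in:

    losses = nlp.update(examples, sgd=optimizer)
    print(losses.get("ner", 0.0))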

spacy/ml/_precomputable_affine.py

@@ -1,13 +1,14 @@
 from thinc.api import Model, normal_init

-def PrecomputableAffine(nO, nI, nF, nP):
+def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
     model = Model(
         "precomputable_affine",
         forward,
         init=init,
         dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
         params={"W": None, "b": None, "pad": None},
+        attrs={"dropout_rate": dropout}
     )
     return model
@@ -48,17 +49,14 @@ def forward(model, X, is_train):
     model.inc_grad("b", dY.sum(axis=0))
     dY = dY.reshape((dY.shape[0], nO * nP))

-    Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
+    Wopfi = W.transpose((1, 2, 0, 3))
     Wopfi = Wopfi.reshape((nO * nP, nF * nI))
     dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
-    # Reuse the buffer
-    dWopfi = Wopfi
-    dWopfi.fill(0.0)
-    model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
+    dWopfi = model.ops.gemm(dY, Xf, trans1=True)
     dWopfi = dWopfi.reshape((nO, nP, nF, nI))
     # (o, p, f, i) --> (f, o, p, i)
-    dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
+    dWopfi = dWopfi.transpose((2, 0, 1, 3))
     model.inc_grad("W", dWopfi)
     return dXf.reshape((dXf.shape[0], nF, nI))

spacy/ml/models/textcat.py

@@ -87,16 +87,16 @@ def build_text_classifier(
     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
         lower = HashEmbed(
-            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
+            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
         )
         prefix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
         )
         suffix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
         )
         shape = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
         )

     width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
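The explicit seeds matter because two HashEmbed tables built with the same seed hash keys identically; distinct seeds decorrelate their collisions and make runs reproducible. A sketch with made-up dimensions, assuming thinc's HashEmbed:

    from thinc.api import HashEmbed
    lower = HashEmbed(nO=96, nV=2000, column=1, dropout=0.1, seed=10)
    prefix = HashEmbed(nO=48, nV=2000, column=2, dropout=0.1, seed=11)  # different seed, different collisions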

spacy/ml/models/tok2vec.py

@@ -154,16 +154,16 @@ def LayerNormalizedMaxout(width, maxout_pieces):
 def MultiHashEmbed(
     columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
 ):
-    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
+    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6)
     if use_subwords:
         prefix = HashEmbed(
-            nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
+            nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout, seed=7
         )
         suffix = HashEmbed(
-            nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
+            nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout, seed=8
         )
         shape = HashEmbed(
-            nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
+            nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout, seed=9
         )

     if pretrained_vectors:
@@ -192,7 +192,7 @@ def MultiHashEmbed(
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
-    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
+    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5)
     chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
     with Model.define_operators({">>": chain, "|": concatenate}):
         embed_layer = chr_embed | features >> with_array(norm)
@@ -263,20 +263,20 @@ def build_Tok2Vec_model(
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
         norm = HashEmbed(
-            nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout,
+            nO=width, nV=embed_size, column=cols.index(NORM), dropout=None,
             seed=0
         )
         if subword_features:
             prefix = HashEmbed(
-                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout,
+                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None,
                 seed=1
             )
             suffix = HashEmbed(
-                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout,
+                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None,
                 seed=2
             )
             shape = HashEmbed(
-                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout,
+                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None,
                 seed=3
             )
         else:
@@ -296,7 +296,7 @@ def build_Tok2Vec_model(
             >> Maxout(
                 nO=width,
                 nI=width * columns,
-                nP=maxout_pieces,
+                nP=3,
                 dropout=0.0,
                 normalize=True,
             ),
@@ -309,7 +309,7 @@ def build_Tok2Vec_model(
             >> Maxout(
                 nO=width,
                 nI=width * columns,
-                nP=maxout_pieces,
+                nP=3,
                 dropout=0.0,
                 normalize=True,
             ),
@@ -322,7 +322,7 @@ def build_Tok2Vec_model(
             >> Maxout(
                 nO=width,
                 nI=width * columns,
-                nP=maxout_pieces,
+                nP=3,
                 dropout=0.0,
                 normalize=True,
             ),
@@ -335,7 +335,7 @@ def build_Tok2Vec_model(
     reduce_dimensions = Maxout(
         nO=width,
         nI=nM * nC + width,
-        nP=maxout_pieces,
+        nP=3,
         dropout=0.0,
         normalize=True,
     )

spacy/ml/tb_framework.py

@@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear
 from ..syntax._parser_model import ParserStepModel

-def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
+def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
     """Set up a stepwise transition-based model"""
     if upper is None:
         has_upper = False

spacy/morphology.pyx

@@ -272,7 +272,7 @@ cdef class Morphology:
     @staticmethod
     def feats_to_dict(feats):
-        if not feats:
+        if not feats or feats == Morphology.EMPTY_MORPH:
             return {}
         return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
                 [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
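Concretely, with the new guard (illustrative values):

    Morphology.feats_to_dict("Case=Nom|Number=Sing")  # -> {"Case": "Nom", "Number": "Sing"}
    Morphology.feats_to_dict(Morphology.EMPTY_MORPH)  # -> {} rather than trying to split the sentinel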

spacy/pipeline/morphologizer.pyx

@@ -3,7 +3,7 @@ cimport numpy as np
 import numpy
 import srsly
-from thinc.api import to_categorical
+from thinc.api import SequenceCategoricalCrossentropy

 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
@@ -85,13 +85,10 @@ class Morphologizer(Tagger):
         doc.is_morphed = True

     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = {tag: i for i, tag in enumerate(self.labels)}
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        truths = []
         for eg in examples:
+            eg_truths = []
             pos_tags = eg.get_aligned("POS", as_string=True)
             morphs = eg.get_aligned("MORPH", as_string=True)
             for i in range(len(morphs)):
@@ -104,20 +101,11 @@ class Morphologizer(Tagger):
                 morph = self.vocab.strings[self.vocab.morphology.add(feats)]
                 if morph == "":
                     morph = Morphology.EMPTY_MORPH
-            if morph is None:
-                correct[idx] = guesses[idx]
-            elif morph in tag_index:
-                correct[idx] = tag_index[morph]
-            else:
-                correct[idx] = 0
-                known_labels[idx] = 0.
-            idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [eg.predicted for eg in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+                eg_truths.append(morph)
+            truths.append(eg_truths)
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores

     def to_bytes(self, exclude=tuple()):
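The rewritten get_loss delegates label handling to thinc's sequence loss; a minimal sketch with made-up labels, assuming the API as imported above:

    from thinc.api import SequenceCategoricalCrossentropy
    loss_func = SequenceCategoricalCrossentropy(names=["Case=Nom", "Case=Acc"], normalize=False)
    # scores: one Floats2d per doc; truths: List[List[str]] of aligned label strings
    # d_scores, loss = loss_func(scores, truths)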

spacy/pipeline/pipes.pyx

@@ -58,12 +58,8 @@ class Pipe(object):
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        predictions = self.predict([doc])
-        if isinstance(predictions, tuple) and len(predictions) == 2:
-            scores, tensors = predictions
-            self.set_annotations([doc], scores, tensors=tensors)
-        else:
-            self.set_annotations([doc], predictions)
+        scores = self.predict([doc])
+        self.set_annotations([doc], scores)
         return doc

     def pipe(self, stream, batch_size=128):
@@ -73,12 +69,8 @@ class Pipe(object):
         and `set_annotations()` methods.
         """
         for docs in util.minibatch(stream, size=batch_size):
-            predictions = self.predict(docs)
-            if isinstance(predictions, tuple) and len(tuple) == 2:
-                scores, tensors = predictions
-                self.set_annotations(docs, scores, tensors=tensors)
-            else:
-                self.set_annotations(docs, predictions)
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
             yield from docs

     def predict(self, docs):
@@ -87,7 +79,7 @@ class Pipe(object):
         """
         raise NotImplementedError

-    def set_annotations(self, docs, scores, tensors=None):
+    def set_annotations(self, docs, scores):
         """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError
@@ -281,9 +273,10 @@ class Tagger(Pipe):
                 idx += 1
         doc.is_tagged = True

-    def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
+    def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)

         try:
             if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
@@ -303,11 +296,11 @@ class Tagger(Pipe):
         if sgd not in (None, False):
             self.model.finish_update(sgd)
-        if losses is not None:
-            losses[self.name] += loss
+        losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, self._scores2guesses(tag_scores))
+        return losses

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         """Perform a 'rehearsal' update, where we try to match the output of
@@ -334,7 +327,7 @@ class Tagger(Pipe):
             losses[self.name] += (gradient**2).sum()

     def get_loss(self, examples, scores):
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
         truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
@@ -521,29 +514,23 @@ class SentenceRecognizer(Tagger):
                     doc.c[j].sent_start = -1

     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = range(len(self.labels))
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        labels = self.labels
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        truths = []
         for eg in examples:
-            sent_starts = eg.get_aligned("sent_start")
-            for sent_start in sent_starts:
-                if sent_start is None:
-                    correct[idx] = guesses[idx]
-                elif sent_start in tag_index:
-                    correct[idx] = sent_start
-                else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [eg.predicted for eg in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+            eg_truth = []
+            for x in eg.get_aligned("sent_start"):
+                if x == None:
+                    eg_truth.append(None)
+                elif x == 1:
+                    eg_truth.append(labels[1])
+                else:
+                    # anything other than 1: 0, -1, -1 as uint64
+                    eg_truth.append(labels[0])
+            truths.append(eg_truth)
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores

     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
@@ -641,7 +628,7 @@ class MultitaskObjective(Tagger):
     def labels(self, value):
         self.cfg["labels"] = value

-    def set_annotations(self, docs, dep_ids, tensors=None):
+    def set_annotations(self, docs, dep_ids):
         pass

     def begin_training(self, get_examples=lambda: [], pipeline=None,
@@ -738,7 +725,7 @@ class ClozeMultitask(Pipe):
         self.cfg = cfg
         self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config

-    def set_annotations(self, docs, dep_ids, tensors=None):
+    def set_annotations(self, docs, dep_ids):
         pass

     def begin_training(self, get_examples=lambda: [], pipeline=None,
@@ -767,7 +754,7 @@ class ClozeMultitask(Pipe):
         loss = self.distance.get_loss(prediction, target)
         return loss, gradient

-    def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
+    def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
         pass

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
@@ -815,8 +802,8 @@ class TextCategorizer(Pipe):

     def pipe(self, stream, batch_size=128):
         for docs in util.minibatch(stream, size=batch_size):
-            scores, tensors = self.predict(docs)
-            self.set_annotations(docs, scores, tensors=tensors)
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
             yield from docs

     def predict(self, docs):
@@ -826,22 +813,25 @@ class TextCategorizer(Pipe):
             # Handle cases where there are no tokens in any docs.
             xp = get_array_module(tensors)
             scores = xp.zeros((len(docs), len(self.labels)))
-            return scores, tensors
+            return scores

         scores = self.model.predict(docs)
         scores = self.model.ops.asarray(scores)
-        return scores, tensors
+        return scores

-    def set_annotations(self, docs, scores, tensors=None):
+    def set_annotations(self, docs, scores):
         for i, doc in enumerate(docs):
             for j, label in enumerate(self.labels):
                 doc.cats[label] = float(scores[i, j])

-    def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
+    def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
         try:
             if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
                 # Handle cases where there are no tokens in any docs.
-                return
+                return losses
         except AttributeError:
             types = set([type(eg) for eg in examples])
             raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types))
@@ -853,12 +843,11 @@ class TextCategorizer(Pipe):
         bp_scores(d_scores)
         if sgd is not None:
             self.model.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += loss
+        losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, scores=scores)
+        return losses

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         if self._rehearsal_model is None:
@@ -1082,12 +1071,13 @@ class EntityLinker(Pipe):
             sgd = self.create_optimizer()
         return sgd

-    def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
+    def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
         self.require_kb()
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
         if not examples:
-            return 0
+            return losses
         sentence_docs = []
         try:
             docs = [eg.predicted for eg in examples]
@@ -1130,20 +1120,19 @@ class EntityLinker(Pipe):
             return 0.0
         sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
         loss, d_scores = self.get_similarity_loss(
-            scores=sentence_encodings,
+            sentence_encodings=sentence_encodings,
             examples=examples
         )
         bp_context(d_scores)
         if sgd is not None:
             self.model.finish_update(sgd)
-        if losses is not None:
-            losses[self.name] += loss
+        losses[self.name] += loss
         if set_annotations:
             self.set_annotations(docs, predictions)
-        return loss
+        return losses

-    def get_similarity_loss(self, examples, scores):
+    def get_similarity_loss(self, examples, sentence_encodings):
         entity_encodings = []
         for eg in examples:
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
@@ -1155,41 +1144,23 @@ class EntityLinker(Pipe):
         entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")

-        if scores.shape != entity_encodings.shape:
+        if sentence_encodings.shape != entity_encodings.shape:
             raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))

-        gradients = self.distance.get_grad(scores, entity_encodings)
-        loss = self.distance.get_loss(scores, entity_encodings)
+        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
+        loss = self.distance.get_loss(sentence_encodings, entity_encodings)
         loss = loss / len(entity_encodings)
         return loss, gradients

-    def get_loss(self, examples, scores):
-        cats = []
-        for eg in examples:
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    cats.append([1.0])
-        cats = self.model.ops.asarray(cats, dtype="float32")
-        if len(scores) != len(cats):
-            raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
-        d_scores = (scores - cats)
-        loss = (d_scores ** 2).sum()
-        loss = loss / len(cats)
-        return loss, d_scores
-
     def __call__(self, doc):
-        kb_ids, tensors = self.predict([doc])
-        self.set_annotations([doc], kb_ids, tensors=tensors)
+        kb_ids = self.predict([doc])
+        self.set_annotations([doc], kb_ids)
         return doc

     def pipe(self, stream, batch_size=128):
         for docs in util.minibatch(stream, size=batch_size):
-            kb_ids, tensors = self.predict(docs)
-            self.set_annotations(docs, kb_ids, tensors=tensors)
+            kb_ids = self.predict(docs)
+            self.set_annotations(docs, kb_ids)
             yield from docs

     def predict(self, docs):
@@ -1197,10 +1168,9 @@ class EntityLinker(Pipe):
         self.require_kb()
         entity_count = 0
         final_kb_ids = []
-        final_tensors = []

         if not docs:
-            return final_kb_ids, final_tensors
+            return final_kb_ids
         if isinstance(docs, Doc):
             docs = [docs]
@@ -1234,21 +1204,18 @@ class EntityLinker(Pipe):
                         if to_discard and ent.label_ in to_discard:
                             # ignoring this entity - setting to NIL
                             final_kb_ids.append(self.NIL)
-                            final_tensors.append(sentence_encoding)
                         else:
                             candidates = self.kb.get_candidates(ent.text)
                             if not candidates:
                                 # no prediction possible for this entity - setting to NIL
                                 final_kb_ids.append(self.NIL)
-                                final_tensors.append(sentence_encoding)
                             elif len(candidates) == 1:
                                 # shortcut for efficiency reasons: take the 1 candidate
                                 # TODO: thresholding
                                 final_kb_ids.append(candidates[0].entity_)
-                                final_tensors.append(sentence_encoding)
                             else:
                                 random.shuffle(candidates)
@@ -1277,14 +1244,13 @@ class EntityLinker(Pipe):
                                 best_index = scores.argmax().item()
                                 best_candidate = candidates[best_index]
                                 final_kb_ids.append(best_candidate.entity_)
-                                final_tensors.append(sentence_encoding)

-        if not (len(final_tensors) == len(final_kb_ids) == entity_count):
+        if not (len(final_kb_ids) == entity_count):
             raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))

-        return final_kb_ids, final_tensors
+        return final_kb_ids

-    def set_annotations(self, docs, kb_ids, tensors=None):
+    def set_annotations(self, docs, kb_ids):
         count_ents = len([ent for doc in docs for ent in doc.ents])
         if count_ents != len(kb_ids):
             raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
@@ -1400,11 +1366,7 @@ class Sentencizer(Pipe):
     def pipe(self, stream, batch_size=128):
         for docs in util.minibatch(stream, size=batch_size):
             predictions = self.predict(docs)
-            if isinstance(predictions, tuple) and len(tuple) == 2:
-                scores, tensors = predictions
-                self.set_annotations(docs, scores, tensors=tensors)
-            else:
-                self.set_annotations(docs, predictions)
+            self.set_annotations(docs, predictions)
             yield from docs

     def predict(self, docs):
@@ -1435,7 +1397,7 @@ class Sentencizer(Pipe):
             guesses.append(doc_guesses)
         return guesses

-    def set_annotations(self, docs, batch_tag_ids, tensors=None):
+    def set_annotations(self, docs, batch_tag_ids):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
View File
@ -57,7 +57,7 @@ class SimpleNER(Pipe):
         scores = self.model.predict(docs)
         return scores

-    def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None):
+    def set_annotations(self, docs: List[Doc], scores: List[Floats2d]):
         """Set entities on a batch of documents from a batch of scores."""
         tag_names = self.get_tag_names()
         for i, doc in enumerate(docs):
@ -67,9 +67,12 @@ class SimpleNER(Pipe):
         tags = iob_to_biluo(tags)
         doc.ents = spans_from_biluo_tags(doc, tags)

-    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
+    def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
+        if losses is None:
+            losses = {}
+        losses.setdefault("ner", 0.0)
         if not any(_has_ner(eg) for eg in examples):
-            return 0
+            return losses
         docs = [eg.predicted for eg in examples]
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update(docs)
@ -79,10 +82,8 @@ class SimpleNER(Pipe):
         self.set_annotations(docs, scores)
         if sgd is not None:
             self.model.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault("ner", 0.0)
-            losses["ner"] += loss
-        return loss
+        losses["ner"] += loss
+        return losses

     def get_loss(self, examples, scores):
         loss = 0
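(The diffs above change the training contract: update() now takes keyword-only arguments and always returns the losses dict, even when the batch has nothing to learn from. A hedged sketch of the resulting calling convention; `ner`, `optimizer` and `train_examples` are placeholder names, not from this commit.)

from spacy.util import minibatch

losses = {}
for batch in minibatch(train_examples, size=8):
    # the same dict is threaded through and returned, so per-component
    # losses accumulate across batches
    losses = ner.update(batch, sgd=optimizer, losses=losses)
print(losses["ner"])  # cumulative NER loss so far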
View File
@ -83,12 +83,14 @@ class Tok2Vec(Pipe):
         assert tokvecs.shape[0] == len(doc)
         doc.tensor = tokvecs

-    def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
+    def update(self, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False):
         """Update the model.

-        examples (iterable): A batch of examples
+        examples (Iterable[Example]): A batch of examples
         drop (float): The dropout rate.
-        sgd (callable): An optimizer.
-        RETURNS (dict): Results from the update.
+        sgd (Optimizer): An optimizer.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        set_annotations (bool): whether or not to update the examples with the predictions
+        RETURNS (Dict[str, float]): The updated losses dictionary
         """
         if losses is None:
             losses = {}
@ -124,6 +126,7 @@ class Tok2Vec(Pipe):
             self.listeners[-1].receive(batch_id, tokvecs, backprop)
         if set_annotations:
             self.set_annotations(docs, tokvecs)
+        return losses

     def get_loss(self, docs, golds, scores):
         pass
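(Tok2Vec here broadcasts each batch's output, plus its backprop callback, to registered listener layers, so downstream components can reuse one shared embedding computation. A minimal sketch of that pattern under assumed names; this is not the actual Tok2VecListener implementation.)

class SharedVectorListener:
    """Caches the shared tok2vec output for one batch, plus the callback
    that will route the consumer's gradient back into the shared model."""

    def receive(self, batch_id, outputs, backprop):
        self.batch_id = batch_id
        self.outputs = outputs
        self.backprop = backprop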
View File
@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
 class ProjectConfigAsset(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
-    url: StrictStr = Field(..., title="URL of asset")
+    url: Optional[StrictStr] = Field(None, title="URL of asset")
     checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     # fmt: on
@ -232,9 +232,10 @@ class ProjectConfigCommand(BaseModel):
     name: StrictStr = Field(..., title="Name of command")
     help: Optional[StrictStr] = Field(None, title="Command description")
     script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
-    deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
-    outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
-    outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
+    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
+    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
+    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
+    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
     # fmt: on

     class Config:
@ -246,7 +247,7 @@ class ProjectConfigSchema(BaseModel):
     # fmt: off
     variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
     assets: List[ProjectConfigAsset] = Field([], title="Data assets")
-    run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
+    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
     # fmt: on
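(To make the schema change concrete: a hypothetical project config that validates against the models above, using the new "workflows" mapping in place of the old "run" list. File names and commands are invented for illustration.)

cfg = {
    "variables": {"lang": "en"},
    "assets": [{"dest": "assets/data.json", "url": "https://example.com/data.json"}],
    "commands": [
        {"name": "train", "script": ["python train.py"], "deps": ["assets/data.json"]},
        {"name": "evaluate", "script": ["python evaluate.py"]},
    ],
    "workflows": {"all": ["train", "evaluate"]},
}
validated = ProjectConfigSchema(**cfg)  # pydantic raises ValidationError on bad input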
View File
@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no

 class ParserStepModel(Model):
-    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
+    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
+                 dropout=0.1):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
         self.attrs["has_upper"] = has_upper
+        self.attrs["dropout_rate"] = dropout
         self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
         if layers[1].get_dim("nP") >= 2:
             activation = "maxout"
@ -243,6 +245,13 @@ class ParserStepModel(Model):
             for class_ in unseen_classes:
                 self._class_mask[class_] = 0.

+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
     @property
     def nO(self):
         if self.attrs["has_upper"]:
@ -271,6 +280,19 @@ class ParserStepModel(Model):
             c_ids += ids.shape[1]
         return ids

+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+                and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@ -289,11 +311,17 @@ class ParserStepModel(Model):
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs

+NUMPY_OPS = NumpyOps()
+
 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+    mask = None
     if model.attrs["has_upper"]:
+        dropout_rate = model.attrs["dropout_rate"]
+        if is_train and dropout_rate > 0:
+            mask = NUMPY_OPS.get_dropout_mask(vector.shape, dropout_rate)
+            vector *= mask
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
@ -305,16 +333,9 @@ def step_forward(model: ParserStepModel, states, is_train):
         # Zero vectors for unseen classes
         d_scores *= model._class_mask
         d_vector = get_d_vector(d_scores)
-        if isinstance(model.state2vec.ops, CupyOps) \
-                and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            model.backprops.append((
-                util.get_async(model.cuda_stream, token_ids),
-                util.get_async(model.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+        if mask is not None:
+            d_vector *= mask
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
         return None
     return scores, backprop_parser_step
@ -437,7 +458,7 @@ cdef class precompute_hiddens:
         sum_state_features(<float*>state_vector.data,
             feat_weights, &ids[0,0],
             token_ids.shape[0], self.nF, self.nO*self.nP)
-        state_vector = state_vector + self.bias
+        state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

         def backward(d_state_vector_ids):
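(The mask plumbing added above is standard inverted dropout: a mask drawn in the forward pass scales surviving units by 1/(1-rate), and the very same mask is reapplied to the gradient on the way back. A generic NumPy sketch of that contract, not thinc's get_dropout_mask itself.)

import numpy as np

def dropout_forward(x, rate, is_train, rng=np.random):
    if not is_train or rate <= 0:
        return x, None
    # inverted dropout: rescale at train time so inference needs no change
    mask = (rng.uniform(size=x.shape) >= rate) / (1.0 - rate)
    return x * mask, mask

def dropout_backward(d_out, mask):
    # reuse the forward mask so dropped units receive zero gradient
    return d_out if mask is None else d_out * mask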
View File
@ -65,7 +65,6 @@ cdef class Parser:
         self.set_output(self.moves.n_moves)
         self.cfg = dict(cfg)
         self.cfg.setdefault("update_with_oracle_cut_size", 100)
-        self.cfg.setdefault("normalize_gradients_with_batch_size", True)
         self._multitasks = []
         for multitask in cfg.get("multitasks", []):
             self.add_multitask_objective(multitask)
@ -154,7 +153,7 @@ cdef class Parser:
         doc (Doc): The document to be processed.
         """
         states = self.predict([doc])
-        self.set_annotations([doc], states, tensors=None)
+        self.set_annotations([doc], states)
         return doc

     def pipe(self, docs, int batch_size=256):
@ -171,7 +170,7 @@ cdef class Parser:
             for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
                 subbatch = list(subbatch)
                 parse_states = self.predict(subbatch)
-                self.set_annotations(subbatch, parse_states, tensors=None)
+                self.set_annotations(subbatch, parse_states)
             yield from batch_in_order

     def predict(self, docs):
@ -201,6 +200,8 @@ cdef class Parser:
         with nogil:
             self._parseC(&states[0],
                 weights, sizes)
+        model.clear_memory()
+        del model
         return batch

     cdef void _parseC(self, StateC** states,
@ -223,7 +224,7 @@ cdef class Parser:
                 unfinished.clear()
             free_activations(&activations)

-    def set_annotations(self, docs, states, tensors=None):
+    def set_annotations(self, docs, states):
         cdef StateClass state
         cdef Doc doc
         for i, (state, doc) in enumerate(zip(states, docs)):
@ -264,7 +265,7 @@ cdef class Parser:
                 states[i].push_hist(guess)
         free(is_valid)

-    def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
+    def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
         cdef StateClass state
         if losses is None:
             losses = {}
@ -280,11 +281,12 @@ cdef class Parser:
             [eg.predicted for eg in examples])
         if self.cfg["update_with_oracle_cut_size"] >= 1:
             # Chop sequences into lengths of this many transitions, to make the
-            # batch uniform length. We randomize this to overfit less.
+            # batch uniform length.
+            # We used to randomize this, but it's not clear that actually helps?
             cut_size = self.cfg["update_with_oracle_cut_size"]
             states, golds, max_steps = self._init_gold_batch(
                 examples,
-                max_length=numpy.random.choice(range(5, cut_size))
+                max_length=cut_size
             )
         else:
             states, golds, _ = self.moves.init_gold_batch(examples)
@ -292,24 +294,15 @@ cdef class Parser:
         if not states:
             return losses
         all_states = list(states)
-        states_golds = zip(states, golds)
-        for _ in range(max_steps):
-            if not states_golds:
-                break
+        states_golds = list(zip(states, golds))
+        while states_golds:
             states, golds = zip(*states_golds)
             scores, backprop = model.begin_update(states)
             d_scores = self.get_batch_loss(states, golds, scores, losses)
-            if self.cfg["normalize_gradients_with_batch_size"]:
-                # We have to be very careful how we do this, because of the way we
-                # cut up the batch. We subdivide long sequences. If we normalize
-                # naively, we end up normalizing by sequence length, which
-                # is bad: that would mean that states in long sequences
-                # consistently get smaller gradients. Imagine if we have two
-                # sequences, one length 1000, one length 20. If we cut up
-                # the 1k sequence so that we have a "batch" of 50 subsequences,
-                # we don't want the gradients to get 50 times smaller!
-                d_scores /= n_examples
+            # Note that the gradient isn't normalized by the batch size
+            # here, because our "samples" are really the states... But we
+            # can't normalize by the number of states either, as then we'd
+            # be getting smaller gradients for states in long sequences.
             backprop(d_scores)
             # Follow the predicted action
             self.transition_states(states, scores)
@ -321,6 +314,13 @@ cdef class Parser:
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, all_states)
+        # Ugh, this is annoying. If we're working on GPU, we want to free the
+        # memory ASAP. It seems that Python doesn't necessarily get around to
+        # removing these in time if we don't explicitly delete? It's confusing.
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
         return losses
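(The reshaped loop above has a simple skeleton: score all unfinished (state, gold) pairs in lockstep, backprop each step's loss unnormalized, apply the predicted transitions, and drop finished states. A hedged plain-Python sketch of that shape; the helper names are placeholders, not the Parser API.)

def transition_train_step(model, states, golds, get_batch_loss, transition_states):
    pairs = list(zip(states, golds))
    while pairs:
        sts, gds = zip(*pairs)
        scores, backprop = model.begin_update(sts)
        backprop(get_batch_loss(sts, gds, scores))
        transition_states(sts, scores)  # follow the predicted actions
        pairs = [(s, g) for s, g in pairs if not s.is_final()]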
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
@ -344,7 +344,7 @@ cdef class Parser:
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
         tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, finish_update = self.model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
         n_scores = 0.
         loss = 0.
         while states:
@ -360,10 +360,16 @@ cdef class Parser:
             states = [state for state in states if not state.is_final()]
             n_scores += d_scores.size
         # Do the backprop
-        finish_update(docs)
+        backprop_tok2vec(docs)
         if sgd is not None:
             self.model.finish_update(sgd)
         losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
         return losses
     def get_gradients(self):
@ -407,6 +413,7 @@ cdef class Parser:
             cpu_log_loss(c_d_scores,
                 costs, is_valid, &scores[i, 0], d_scores.shape[1])
             c_d_scores += d_scores.shape[1]
+        # Note that we don't normalize this. See comment in update() for why.
         if losses is not None:
             losses.setdefault(self.name, 0.)
             losses[self.name] += (d_scores**2).sum()
@ -525,21 +532,25 @@ cdef class Parser:
             StateClass state
             Transition action
         all_states = self.moves.init_batch([eg.predicted for eg in examples])
+        states = []
+        golds = []
         kept = []
         max_length_seen = 0
         for state, eg in zip(all_states, examples):
             if self.moves.has_gold(eg) and not state.is_final():
                 gold = self.moves.init_gold(state, eg)
-                oracle_actions = self.moves.get_oracle_sequence_from_state(
-                    state.copy(), gold)
-                kept.append((eg, state, gold, oracle_actions))
-                min_length = min(min_length, len(oracle_actions))
-                max_length_seen = max(max_length, len(oracle_actions))
+                if len(eg.x) < max_length:
+                    states.append(state)
+                    golds.append(gold)
+                else:
+                    oracle_actions = self.moves.get_oracle_sequence_from_state(
+                        state.copy(), gold)
+                    kept.append((eg, state, gold, oracle_actions))
+                    min_length = min(min_length, len(oracle_actions))
+                    max_length_seen = max(max_length, len(oracle_actions))
         if not kept:
-            return [], [], 0
+            return states, golds, 0
         max_length = max(min_length, min(max_length, max_length_seen))
-        states = []
-        golds = []
         cdef int clas
         max_moves = 0
         for eg, state, gold, oracle_actions in kept:
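(The new branch above trains short examples on their whole sequences and only computes oracle action sequences, which are expensive, for examples long enough to need chopping into max_length windows. A simplified sketch of the idea with hypothetical helpers.)

def split_batch(examples, max_length, oracle_len):
    whole, chopped = [], []
    for eg in examples:
        if len(eg) < max_length:
            whole.append(eg)  # short enough: train on the full sequence
        else:
            # only long examples pay for the oracle action sequence
            n_actions = oracle_len(eg)
            for start in range(0, n_actions, max_length):
                chopped.append((eg, start, min(start + max_length, n_actions)))
    return whole, chopped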
View File
@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):

 def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
     assert contains_cycle(tree) is None
-    assert contains_cycle(cyclic_tree) == set([3, 4, 5])
+    assert contains_cycle(cyclic_tree) == {3, 4, 5}
     assert contains_cycle(partial_tree) is None
     assert contains_cycle(multirooted_tree) is None
View File
@ -198,10 +198,10 @@ def test_overfitting_IO():
     nlp.add_pipe(parser)
     optimizer = nlp.begin_training()

-    for i in range(50):
+    for i in range(100):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses["parser"] < 0.00001
+    assert losses["parser"] < 0.0001

     # test the trained model
     test_text = "I like securities."
View File
@ -38,6 +38,11 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    # add some cases where SENT_START == -1
+    train_examples[0].reference[10].is_sent_start = False
+    train_examples[1].reference[1].is_sent_start = False
+    train_examples[1].reference[11].is_sent_start = False
     nlp.add_pipe(senter)
     optimizer = nlp.begin_training()
View File
@ -84,7 +84,7 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
     nlp = English()
-    textcat = nlp.create_pipe("textcat")
+    textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
View File
@ -23,6 +23,7 @@ def test_issue2070():
     assert len(doc) == 11

+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2179():
     """Test that spurious 'extra_labels' aren't created when initializing NER."""
     nlp = Italian()
@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
     assert len(matches) == 3

+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2482():
     """Test we can serialize and deserialize a blank NER or parser model."""
     nlp = Italian()
View File
@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
     assert doc[0].like_num

+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2800():
     """Test issue that arises when too many labels are added to NER model.
     Used to cause segfault.
     """
     nlp = English()
     train_data = []
-    train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
+    train_data.extend(
+        [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
+    )
     entity_types = [str(i) for i in range(1000)]
     ner = nlp.create_pipe("ner")
     nlp.add_pipe(ner)
View File
@ -88,6 +88,7 @@ def test_issue3199():
     assert list(doc[0:3].noun_chunks) == []

+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3209():
     """Test issue that occurred in spaCy nightly where NER labels were being
     mapped to classes incorrectly after loading the model, when the labels
View File
@ -0,0 +1,472 @@
import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.defaults import default_parser
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from thinc.api import compounding
import spacy
import srsly
import numpy
from ..util import make_tempdir, get_doc
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop
def test_issue_3526_1(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_issue_3526_2(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_issue_3526_3(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = numpy.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])
@pytest.mark.xfail
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe(nlp.create_pipe("parser"))
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.get_pipe("parser").add_label("dep")
nlp.get_pipe("ner").add_label("PERSON")
nlp.get_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
pass
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"
def test_issue3962(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
def test_issue3962_long(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
# head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# head set to the new artificial head (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids
View File
@ -1,8 +0,0 @@
import pytest
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop
View File
@ -1,85 +0,0 @@
import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy import load
import srsly
from ..util import make_tempdir
@pytest.fixture
def patterns():
return [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
@pytest.fixture
def add_ent():
def add_ent_component(doc):
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
return doc
return add_ent_component
def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True
View File
@ -1,30 +0,0 @@
from spacy import displacy
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html
View File
@ -1,44 +0,0 @@
from spacy.tokens import Doc
import numpy as np
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = np.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()
View File
@ -1,12 +0,0 @@
import pytest
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])
View File
@ -1,14 +0,0 @@
import pytest
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher
@pytest.mark.xfail
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)
View File
@ -1,45 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
View File
@ -1,9 +0,0 @@
from spacy.lang.hi import Hindi
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected
View File
@ -1,10 +0,0 @@
from spacy.lang.es import Spanish
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
View File
@ -1,34 +0,0 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels
View File
@ -1,18 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
View File
@ -1,25 +0,0 @@
import pytest
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)
View File
@ -1,11 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
View File
@ -1,21 +0,0 @@
from spacy.lang.en import English
import pytest
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe(nlp.create_pipe("parser"))
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.get_pipe("parser").add_label("dep")
nlp.get_pipe("ner").add_label("PERSON")
nlp.get_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
View File

@ -1,12 +0,0 @@
from spacy.displacy import parse_deps
from spacy.tokens import Doc
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)
View File
@ -1,17 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0
View File
@ -1,26 +0,0 @@
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"
View File
@ -1,117 +0,0 @@
import pytest
from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
text = "He jests at scars, that never felt a wound."
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = [
"nsubj",
"ccomp",
"prep",
"pobj",
"punct",
"nsubj",
"neg",
"ROOT",
"det",
"dobj",
"punct",
]
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_issue3962(doc):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
@pytest.fixture
def two_sent_doc(en_tokenizer):
text = "He jests at scars. They never felt a wound."
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = [
"nsubj",
"ROOT",
"prep",
"pobj",
"punct",
"nsubj",
"neg",
"ROOT",
"det",
"dobj",
"punct",
]
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_issue3962_long(two_sent_doc):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
assert (
doc2[4].head.text == "They"
) # head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].dep_ == "dep"
assert (
doc2[4].head.text == "They"
) # head set to the new artificial head (in sentence 2)
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"
View File
@ -1,19 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids

View File

@ -0,0 +1,469 @@
import pytest
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
from spacy.pipeline.defaults import default_ner
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus
from spacy.gold.converters import json2docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict
from ..util import make_tempdir
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1
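# Illustrative sketch (not part of the original diff): building a pattern Doc
# whose NORM values differ from its surface forms, as test_issue4002 does.
# The helper name is hypothetical.
def _make_norm_pattern(vocab, surface_words, norms):
    from spacy.tokens import Doc
    pattern = Doc(vocab, words=surface_words)
    for token, norm in zip(pattern, norms):
        token.norm_ = norm  # override the NORM attribute the matcher keys on
    return pattern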
def test_issue4030():
""" Test whether textcat works fine with empty doc """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()
# Add entity ruler
ruler = EntityRuler(nlp)
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ner") # works fine with "after"
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = load_model(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
This is the second of two bugs underlying issue 4042.
"""
nlp1 = English()
vocab = nlp1.vocab
# add ner pipe
ner1 = nlp1.create_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.add_pipe(ner1)
nlp1.begin_training()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
# reapply the NER - at this point it should resize itself
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
print("lang", vocab2.lang)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = load_model(nlp_dir)
assert nlp3.lang == "en"
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed
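# Illustrative sketch (not part of the original diff): why the counts above
# differ — each optional ({"OP": "?"}) token contributes one extra candidate
# span, so the Matcher can return several overlapping matches per anchor.
# The helper name is hypothetical.
def _show_matches(matcher, doc):
    for match_id, start, end in matcher(doc):
        print(doc.vocab.strings[match_id], doc[start:end].text)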
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos
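# Illustrative sketch (not part of the original diff): the vocab-plus-doc
# round-trip that test_issue4133 exercises — a Doc can only be deserialized
# into a vocab that carries the same strings, so the vocab bytes travel with
# the doc bytes. The helper name is hypothetical.
def _doc_roundtrip(doc):
    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    vocab = Vocab().from_bytes(doc.vocab.to_bytes())
    return Doc(vocab).from_bytes(doc.to_bytes())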
def test_issue4190():
def customize_tokenizer(nlp):
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_
def test_multiple_predictions():
class DummyPipe(Pipe):
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores):
return docs
nlp = Language()
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
""" This should not crash or exit with some strange error code """
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
beams = nlp.entity.beam_parse(
docs, beam_width=beam_width, beam_density=beam_density
)
for doc, beam in zip(docs, beams):
entity_scores = defaultdict(float)
for score, ents in nlp.entity.moves.get_beam_parses(beam):
for start, end, label in ents:
entity_scores[(start, end, label)] += score
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
def test_issue4402():
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_data = list(corpus.train_dataset(nlp))
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4
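# Illustrative sketch (not part of the original diff): the write-then-load
# pattern for .spacy corpus files used by test_issue4402. The helper name is
# hypothetical; output_file is a pathlib-style path.
def _write_spacy_corpus(docs, output_file, attrs):
    from spacy.tokens import DocBin
    data = DocBin(docs=docs, attrs=attrs).to_bytes()
    with output_file.open("wb") as file_:
        file_.write(data)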

View File

@ -1,23 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1

View File

@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example
def test_issue4030():
""" Test whether textcat works fine with empty doc """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0

View File

@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner
from ..util import make_tempdir
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()
# Add entity ruler
ruler = EntityRuler(nlp)
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ner") # works fine with "after"
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
This is the second of two bugs underlying issue 4042.
"""
nlp1 = English()
vocab = nlp1.vocab
# add ner pipe
ner1 = nlp1.create_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.add_pipe(ner1)
nlp1.begin_training()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
# reapply the NER - at this point it should resize itself
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2

View File

@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path
from ..util import make_tempdir
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
print("lang", vocab2.lang)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = spacy.load(nlp_dir)
assert nlp3.lang == "en"

View File

@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed

View File

@ -1,28 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos

View File

@ -1,46 +0,0 @@
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util
from ..util import make_tempdir
def test_issue4190():
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = util.load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
def customize_tokenizer(nlp):
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer

View File

@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2

View File

@ -1,9 +0,0 @@
from spacy.lang.el import Greek
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_

View File

@ -1,25 +0,0 @@
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe
class DummyPipe(Pipe):
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores, tensors=None):
return docs
@pytest.fixture
def nlp():
return Language()
def test_multiple_predictions(nlp):
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)

View File

@ -1,47 +0,0 @@
from collections import defaultdict
import pytest
from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer
from spacy.lang.en import English
from spacy.tokens import Span
# skipped after removing Beam stuff during the Example/GoldParse refactor
@pytest.mark.skip
def test_issue4313():
""" This should not crash or exit with some strange error code """
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
beams = nlp.entity.beam_parse(
docs, beam_width=beam_width, beam_density=beam_density
)
for doc, beam in zip(docs, beams):
entity_scores = defaultdict(float)
for score, ents in nlp.entity.moves.get_beam_parses(beam):
for start, end, label in ents:
entity_scores[(start, end, label)] += score

View File

@ -1,24 +0,0 @@
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch
from thinc.api import compounding
import pytest
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)

View File

@ -1,8 +0,0 @@
from spacy.tokens import DocBin
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])

View File

@ -1,10 +0,0 @@
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)

View File

@ -1,98 +0,0 @@
from spacy.gold import Corpus
from spacy.lang.en import English
from ..util import make_tempdir
from ...gold.converters import json2docs
from ...tokens import DocBin
def test_issue4402():
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_data = list(corpus.train_dataset(nlp))
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}

View File

@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle
from ..util import get_doc, make_tempdir
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
# This is how extension attribute values are stored in the user data
doc.user_data[("._.", "foo", None, None)] = "bar"
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
doc_bin_bytes = doc_bin.to_bytes()
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
assert new_doc.user_data["foo"] == "bar"
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
@pytest.mark.parametrize(
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
doc = en_tokenizer(text)
Example.from_dict(doc, {"words": words})
def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
on_match = Mock()
matcher = DependencyMatcher(en_vocab)
matcher.add("pattern", on_match, pattern)
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc)
on_match_args = on_match.call_args
assert on_match_args[0][3] == matches
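# Illustrative sketch (not part of the original diff): the callback signature
# asserted above — the DependencyMatcher calls on_match(matcher, doc, i,
# matches), so call_args[0][3] is the full list of matches. The callback
# name is hypothetical.
def _on_match(matcher, doc, i, matches):
    print("fired for match", i, "of", len(matches), "in:", doc.text)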
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
not specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4665():
"""
conllu2docs should not raise an exception if the HEAD column contains an
underscore
"""
input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
conllu2docs(input_data)
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities(
entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
)
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1
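# Illustrative sketch (not part of the original diff): the KnowledgeBase disk
# round-trip used above — the reloading KB must be built on the same vocab
# with the same entity_vector_length. The helper name is hypothetical.
def _kb_roundtrip(kb, vocab, file_path):
    from spacy.kb import KnowledgeBase
    kb.dump(str(file_path))
    kb2 = KnowledgeBase(vocab=vocab, entity_vector_length=kb.entity_vector_length)
    kb2.load_bulk(str(file_path))
    return kb2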
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
exclude = ["tokenizer", "sentencizer"]
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir, exclude=exclude)
new_nlp = load_model_from_path(tmpdir, disable=exclude)
assert "sentencizer" not in new_nlp.pipe_names
assert "entity_ruler" in new_nlp.pipe_names
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
nlp.begin_training()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass
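# Illustrative sketch (not part of the original diff): consuming nlp.pipe with
# worker processes, as the test above does — n_process > 1 spawns workers on
# Windows/macOS, which is exactly what these regression tests guard against
# hanging. The helper name is hypothetical.
def _pipe_multiprocess(nlp, texts):
    return list(nlp.pipe(texts, batch_size=2, n_process=2))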
def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""
# USING 1 PROCESS
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
class CustomPipe:
name = "my_pipe"
def __init__(self):
Span.set_extension("my_ext", getter=self._get_my_ext)
Doc.set_extension("my_ext", default=None)
def __call__(self, doc):
gathered_ext = []
for sent in doc.sents:
sent_ext = self._get_my_ext(sent)
sent._.set("my_ext", sent_ext)
gathered_ext.append(sent_ext)
doc._.set("my_ext", "\n".join(gathered_ext))
return doc
@staticmethod
def _get_my_ext(span):
return str(span.end)
def test_issue4903():
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
macOS."""
nlp = English()
custom_component = CustomPipe()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(custom_component, after="sentencizer")
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
docs = list(nlp.pipe(text, n_process=2))
assert docs[0].text == "I like bananas."
assert docs[1].text == "Do you like them?"
assert docs[2].text == "No, I prefer wasabi."
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})
nlp.evaluate([example])

View File

@ -1,16 +0,0 @@
from spacy.tokens import Doc, DocBin
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
# This is how extension attribute values are stored in the user data
doc.user_data[("._.", "foo", None, None)] = "bar"
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
doc_bin_bytes = doc_bin.to_bytes()
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
assert new_doc.user_data["foo"] == "bar"
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"

View File

@ -1,11 +0,0 @@
import pytest
from spacy.gold import Example
@pytest.mark.parametrize(
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
doc = en_tokenizer(text)
Example.from_dict(doc, {"words": words})

View File

@ -1,35 +0,0 @@
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc
def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
on_match = Mock()
matcher = DependencyMatcher(en_vocab)
matcher.add("pattern", on_match, pattern)
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc)
on_match_args = on_match.call_args
assert on_match_args[0][3] == matches

View File

@ -1,62 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from ..util import make_tempdir
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
not specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded

View File

@ -1,35 +0,0 @@
import pytest
# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs
input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
@pytest.mark.xfail
def test_issue4665():
"""
conllu2json should not raise an exception if the HEAD column contains an
underscore
"""
pass
# conllu2json(input_data)

View File

@ -1,36 +0,0 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities(
entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
)
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1

View File

@ -1,20 +0,0 @@
from spacy.util import load_model_from_path
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
exclude = ["tokenizer", "sentencizer"]
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir, exclude=exclude)
new_nlp = load_model_from_path(tmpdir, disable=exclude)
assert "sentencizer" not in new_nlp.pipe_names
assert "entity_ruler" in new_nlp.pipe_names

View File

@ -1,41 +0,0 @@
import pickle
import numpy
from spacy.lang.en import English
from spacy.vocab import Vocab
from spacy.tests.util import make_tempdir
def test_pickle_ner():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
def test_issue4725():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
nlp.begin_training()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass

View File

@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""
# USING 1 PROCESS
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2

View File

@ -1,40 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Span, Doc
class CustomPipe:
name = "my_pipe"
def __init__(self):
Span.set_extension("my_ext", getter=self._get_my_ext)
Doc.set_extension("my_ext", default=None)
def __call__(self, doc):
gathered_ext = []
for sent in doc.sents:
sent_ext = self._get_my_ext(sent)
sent._.set("my_ext", sent_ext)
gathered_ext.append(sent_ext)
doc._.set("my_ext", "\n".join(gathered_ext))
return doc
@staticmethod
def _get_my_ext(span):
return str(span.end)
def test_issue4903():
# ensures that this runs correctly and doesn't hang or crash on Windows / macOS
nlp = English()
custom_component = CustomPipe()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(custom_component, after="sentencizer")
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
docs = list(nlp.pipe(text, n_process=2))
assert docs[0].text == "I like bananas."
assert docs[1].text == "Do you like them?"
assert docs[2].text == "No, I prefer wasabi."

View File

@ -1,8 +0,0 @@
from spacy.gold import Example
from spacy.language import Language
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})
nlp.evaluate([example])

View File

@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
# Test that the comparison between a Span and a Token, goes well
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@ -8,7 +10,6 @@ def test_issue5152():
text = nlp("Talk about being boring!") text = nlp("Talk about being boring!")
text_var = nlp("Talk of being boring!") text_var = nlp("Talk of being boring!")
y = nlp("Let") y = nlp("Let")
span = text[0:3] # Talk about being span = text[0:3] # Talk about being
span_2 = text[0:3] # Talk about being span_2 = text[0:3] # Talk about being
span_3 = text_var[0:3] # Talk of being span_3 = text_var[0:3] # Talk of being

View File

@ -63,7 +63,8 @@ def tagger():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
with pytest.warns(UserWarning):
    tagger.begin_training(pipeline=nlp.pipeline)
return tagger

View File

@ -0,0 +1,31 @@
from spacy.lang.en import English
from spacy.util import fix_random_seed
def test_issue5551():
"""Test that after fixing the random seed, the results of the pipeline are truly identical"""
component = "textcat"
pipe_cfg = {"exclusive_classes": False}
results = []
for i in range(3):
fix_random_seed(0)
nlp = English()
example = (
"Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
{"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
)
nlp.add_pipe(nlp.create_pipe(component, config=pipe_cfg), last=True)
pipe = nlp.get_pipe(component)
for label in set(example[1]["cats"]):
pipe.add_label(label)
nlp.begin_training(component_cfg={component: pipe_cfg})
# Store the result of each iteration
result = pipe.model.predict([nlp.make_doc(example[0])])
results.append(list(result[0]))
# All results should be the same because of the fixed seed
assert len(results) == 3
assert results[0] == results[1]
assert results[0] == results[2]
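# Illustrative sketch (not part of the original diff): the reproducibility
# pattern the test asserts — reseed before every pipeline construction,
# because model initialization itself consumes random state. make_pipeline
# is a hypothetical factory that builds and trains the pipeline.
def _reproducible_scores(make_pipeline, text, n=3):
    from spacy.util import fix_random_seed
    results = []
    for _ in range(n):
        fix_random_seed(0)
        nlp = make_pipeline()
        results.append(nlp(text).cats)
    return results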

View File

@ -1,3 +1,4 @@
import numpy
from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
@ -5,6 +6,7 @@ from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example
from spacy.gold.converters import json2docs
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding
@ -153,6 +155,27 @@ def test_gold_biluo_misalign(en_vocab):
assert tags == ["O", "O", "O", "-", "-", "-"] assert tags == ["O", "O", "O", "-", "-", "-"]
def test_example_constructor(en_vocab):
words = ["I", "like", "stuff"]
tags = ["NOUN", "VERB", "NOUN"]
tag_ids = [en_vocab.strings.add(tag) for tag in tags]
predicted = Doc(en_vocab, words=words)
reference = Doc(en_vocab, words=words)
reference = reference.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
example = Example(predicted, reference)
tags = example.get_aligned("TAG", as_string=True)
assert tags == ["NOUN", "VERB", "NOUN"]
def test_example_from_dict_tags(en_vocab):
words = ["I", "like", "stuff"]
tags = ["NOUN", "VERB", "NOUN"]
predicted = Doc(en_vocab, words=words)
example = Example.from_dict(predicted, {"TAGS": tags})
tags = example.get_aligned("TAG", as_string=True)
assert tags == ["NOUN", "VERB", "NOUN"]
def test_example_from_dict_no_ner(en_vocab):
words = ["a", "b", "c", "d"]
spaces = [True, True, False, True]
@ -272,72 +295,72 @@ def test_split_sentences(en_vocab):
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
entities = [
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
entities = [
(len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", None, "O", "U-LOC", "O"]
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
spaces = [True, True, True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
entities = [
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
spaces = [True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
entities = [
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
@ -407,6 +430,49 @@ def test_biluo_spans(en_tokenizer):
assert spans[1].label_ == "GPE" assert spans[1].label_ == "GPE"
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len("Mr and Mrs Smith"), "PERSON"),
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    ents_ref = example.reference.ents
    assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
    assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
    text = "Mr and Mrs Smith flew to San Francisco Valley"
    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "PERSON", "pattern": "Mr and Mrs Smith"},
        {"label": "LOC", "pattern": "San Francisco Valley"},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len("Mr and Mrs Smith"), "PERSON"),
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
    # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
    ents_pred = example.predicted.ents
    assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
    assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
def test_gold_ner_missing_tags(en_tokenizer): def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.") doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
@ -414,6 +480,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
def test_projectivize(en_tokenizer):
    doc = en_tokenizer("He pretty quickly walks away")
    heads = [3, 2, 3, 0, 2]
    example = Example.from_dict(doc, {"heads": heads})
    proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
    nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
    # The arc quickly -> away crosses the arc walks -> He, so projectivization
    # lifts "away" to attach to "walks" (the head of "quickly") instead.
    assert proj_heads == [3, 2, 3, 0, 3]
    assert nonproj_heads == [3, 2, 3, 0, 2]
def test_iob_to_biluo(): def test_iob_to_biluo():
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]

spacy/tests/test_models.py (new file)
View File

@ -0,0 +1,156 @@
from typing import List
import pytest
from thinc.api import fix_random_seed, Adam, set_dropout_rate
from numpy.testing import assert_array_equal
import numpy
from spacy.ml.models import build_Tok2Vec_model
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
from spacy.lang.en import English
from spacy.lang.en.examples import sentences as EN_SENTENCES
def get_all_params(model):
    params = []
    for node in model.walk():
        for name in node.param_names:
            params.append(node.get_param(name).ravel())
    return node.ops.xp.concatenate(params)


def get_docs():
    nlp = English()
    return list(nlp.pipe(EN_SENTENCES + [" ".join(EN_SENTENCES)]))


def get_gradient(model, Y):
    if isinstance(Y, model.ops.xp.ndarray):
        dY = model.ops.alloc(Y.shape, dtype=Y.dtype)
        dY += model.ops.xp.random.uniform(-1.0, 1.0, Y.shape)
        return dY
    elif isinstance(Y, List):
        return [get_gradient(model, y) for y in Y]
    else:
        raise ValueError(f"Could not get gradient for type {type(Y)}")
def default_tok2vec():
    return build_Tok2Vec_model(**TOK2VEC_KWARGS)


TOK2VEC_KWARGS = {
    "width": 96,
    "embed_size": 2000,
    "subword_features": True,
    "char_embed": False,
    "conv_depth": 4,
    "bilstm_depth": 0,
    "maxout_pieces": 4,
    "window_size": 1,
    "dropout": 0.1,
    "nM": 0,
    "nC": 0,
    "pretrained_vectors": None,
}

TEXTCAT_KWARGS = {
    "width": 64,
    "embed_size": 2000,
    "pretrained_vectors": None,
    "exclusive_classes": False,
    "ngram_size": 1,
    "window_size": 1,
    "conv_depth": 2,
    "dropout": None,
    "nO": 7,
}

TEXTCAT_CNN_KWARGS = {
    "tok2vec": default_tok2vec(),
    "exclusive_classes": False,
    "nO": 13,
}
@pytest.mark.parametrize(
    "seed,model_func,kwargs",
    [
        (0, build_Tok2Vec_model, TOK2VEC_KWARGS),
        (0, build_text_classifier, TEXTCAT_KWARGS),
        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS),
    ],
)
def test_models_initialize_consistently(seed, model_func, kwargs):
    fix_random_seed(seed)
    model1 = model_func(**kwargs)
    model1.initialize()
    fix_random_seed(seed)
    model2 = model_func(**kwargs)
    model2.initialize()
    params1 = get_all_params(model1)
    params2 = get_all_params(model2)
    assert_array_equal(params1, params2)
@pytest.mark.parametrize(
    "seed,model_func,kwargs,get_X",
    [
        (0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
        (0, build_text_classifier, TEXTCAT_KWARGS, get_docs),
        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
    ],
)
def test_models_predict_consistently(seed, model_func, kwargs, get_X):
    fix_random_seed(seed)
    model1 = model_func(**kwargs).initialize()
    Y1 = model1.predict(get_X())
    fix_random_seed(seed)
    model2 = model_func(**kwargs).initialize()
    Y2 = model2.predict(get_X())
    if model1.has_ref("tok2vec"):
        tok2vec1 = model1.get_ref("tok2vec").predict(get_X())
        tok2vec2 = model2.get_ref("tok2vec").predict(get_X())
        for i in range(len(tok2vec1)):
            for j in range(len(tok2vec1[i])):
                assert_array_equal(numpy.asarray(tok2vec1[i][j]), numpy.asarray(tok2vec2[i][j]))
    if isinstance(Y1, numpy.ndarray):
        assert_array_equal(Y1, Y2)
    elif isinstance(Y1, List):
        assert len(Y1) == len(Y2)
        for y1, y2 in zip(Y1, Y2):
            assert_array_equal(y1, y2)
    else:
        raise ValueError(f"Could not compare type {type(Y1)}")
@pytest.mark.parametrize(
    "seed,dropout,model_func,kwargs,get_X",
    [
        (0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
        (0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs),
        (0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
    ],
)
def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
    def get_updated_model():
        fix_random_seed(seed)
        optimizer = Adam(0.001)
        model = model_func(**kwargs).initialize()
        initial_params = get_all_params(model)
        set_dropout_rate(model, dropout)
        for _ in range(5):
            Y, get_dX = model.begin_update(get_X())
            dY = get_gradient(model, Y)
            _ = get_dX(dY)
            model.finish_update(optimizer)
        updated_params = get_all_params(model)
        with pytest.raises(AssertionError):
            assert_array_equal(initial_params, updated_params)
        return model

    model1 = get_updated_model()
    model2 = get_updated_model()
    assert_array_equal(get_all_params(model1), get_all_params(model2))

View File

@ -0,0 +1,31 @@
import pytest
from spacy.cli.project.util import validate_project_commands
from spacy.schemas import ProjectConfigSchema, validate
@pytest.mark.parametrize(
    "config",
    [
        {"commands": [{"name": "a"}, {"name": "a"}]},
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    with pytest.raises(SystemExit):
        validate_project_commands(config)


@pytest.mark.parametrize(
    "config,n_errors",
    [
        ({"commands": {"a": []}}, 1),
        ({"commands": [{"help": "..."}]}, 1),
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        ({"commands": [{"extra": "b"}]}, 2),
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    errors = validate(ProjectConfigSchema, config)
    assert len(errors) == n_errors
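# For contrast, a minimal config that both checks accept might look like the
# following (a hypothetical sketch, not part of the test file): unique command
# names, and a workflow whose steps all refer to existing commands.
#
#     valid_config = {
#         "commands": [{"name": "a", "help": "..."}],
#         "workflows": {"all": ["a"]},
#     }
#     validate_project_commands(valid_config)  # does not raise SystemExit
#     assert validate(ProjectConfigSchema, valid_config) == []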

View File

@ -803,7 +803,7 @@ cdef class Doc:
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in attrs] for id_ in attrs]
if array.dtype != numpy.uint64: if array.dtype != numpy.uint64:
warnings.warn(Warnings.W101.format(type=array.dtype)) warnings.warn(Warnings.W028.format(type=array.dtype))
if SENT_START in attrs and HEAD in attrs: if SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032) raise ValueError(Errors.E032)

View File

@ -20,7 +20,6 @@ import subprocess
from contextlib import contextmanager from contextlib import contextmanager
import tempfile import tempfile
import shutil import shutil
import hashlib
import shlex import shlex
try: try:
@ -449,6 +448,16 @@ def split_command(command: str) -> List[str]:
return shlex.split(command, posix=not is_windows) return shlex.split(command, posix=not is_windows)
def join_command(command: List[str]) -> str:
    """Join a command using shlex. shlex.join is only available for Python 3.8+,
    so we're using a workaround here.

    command (List[str]): The command to join.
    RETURNS (str): The joined command
    """
    return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(command: Union[str, List[str]]) -> None: def run_command(command: Union[str, List[str]]) -> None:
"""Run a command on the command line as a subprocess. If the subprocess """Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed. returns a non-zero exit code, a system exit is performed.
@ -501,23 +510,13 @@ def make_tempdir():
warnings.warn(Warnings.W091.format(dir=d, msg=e)) warnings.warn(Warnings.W091.format(dir=d, msg=e))
def get_hash(data) -> str: def is_cwd(path: Union[Path, str]) -> bool:
"""Get the hash for a JSON-serializable object. """Check whether a path is the current working directory.
data: The data to hash. path (Union[Path, str]): The directory path.
RETURNS (str): The hash. RETURNS (bool): Whether the path is the current working directory.
""" """
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
return hashlib.md5(data_str).hexdigest()
def get_checksum(path: Union[Path, str]) -> str:
"""Get the checksum for a file given its file path.
path (Union[Path, str]): The file path.
RETURNS (str): The checksum.
"""
return hashlib.md5(Path(path).read_bytes()).hexdigest()
def is_in_jupyter(): def is_in_jupyter():
@ -722,6 +721,51 @@ def minibatch(items, size=8):
yield list(batch) yield list(batch)
def minibatch_by_padded_size(docs, size, buffer=256, discard_oversize=False):
    # Batch by padded size: the longest sequence in a sub-batch multiplied by
    # the number of sequences in it. `size` may be a fixed int or a schedule.
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    else:
        size_ = size
    for outer_batch in minibatch(docs, buffer):
        outer_batch = list(outer_batch)
        target_size = next(size_)
        for indices in _batch_by_length(outer_batch, target_size):
            subbatch = [outer_batch[i] for i in indices]
            padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
            if discard_oversize and padded_size >= target_size:
                pass
            else:
                yield subbatch
def _batch_by_length(seqs, max_words):
    """Given a list of sequences, return a batched list of indices into the
    list, where the batches are grouped by length, in descending order.

    Batches may be at most max_words in size, defined as max sequence length * size.
    """
    # Sort by length ascending; pairing each length with its index lets us
    # recover the original positions after sorting.
    lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)]
    lengths_indices.sort()
    batches = []
    batch = []
    for length, i in lengths_indices:
        if not batch:
            batch.append(i)
        elif length * (len(batch) + 1) <= max_words:
            batch.append(i)
        else:
            batches.append(batch)
            batch = [i]
    if batch:
        batches.append(batch)
    # Check lengths match
    assert sum(len(b) for b in batches) == len(seqs)
    batches = [list(sorted(batch)) for batch in batches]
    batches.reverse()
    return batches
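# A worked sketch (hypothetical inputs, not part of util.py): three sequences
# of lengths 1, 2 and 3 with max_words=6. The two shortest share a batch
# (2 items * max length 2 = 4 <= 6), the longest is batched alone, and the
# batches come back longest-first:
#
#     >>> _batch_by_length(["a", "bb", "ccc"], 6)
#     [[2], [0, 1]]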
def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
"""Create minibatches of roughly a given number of words. If any examples """Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by are longer than the specified batch length, they will appear in a batch by
@ -768,7 +812,8 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
# yield the previous batch and start a new one. The new one gets the overflow examples. # yield the previous batch and start a new one. The new one gets the overflow examples.
else: else:
yield batch if batch:
yield batch
target_size = next(size_) target_size = next(size_)
tol_size = target_size * tolerance tol_size = target_size * tolerance
batch = overflow batch = overflow
@ -788,15 +833,15 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
# this example does not fit with the previous overflow: start another new batch # this example does not fit with the previous overflow: start another new batch
else: else:
yield batch if batch:
yield batch
target_size = next(size_) target_size = next(size_)
tol_size = target_size * tolerance tol_size = target_size * tolerance
batch = [doc] batch = [doc]
batch_size = n_words batch_size = n_words
# yield the final batch batch.extend(overflow)
if batch: if batch:
batch.extend(overflow)
yield batch yield batch
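# A usage sketch for the patched generator (hypothetical sizes): with
# size=1000 and tolerance=0.2 a batch is cut once the next doc would push it
# past 1200 words, and the new `if batch:` guards ensure that an empty batch
# is never yielded.
#
#     for batch in minibatch_by_words(docs, size=1000, tolerance=0.2):
#         assert len(batch) > 0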

View File

@ -4,4 +4,34 @@ teaser: Pre-defined model architectures included with the core library
source: spacy/ml/models source: spacy/ml/models
--- ---
TODO: write TODO: intro and how architectures work, link to
[`registry`](/api/top-level#registry),
[custom models](/usage/training#custom-models) usage etc.
## Parser architectures {source="spacy/ml/models/parser.py"}
### spacy.TransitionBasedParser.v1
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TransitionBasedParser.v1"
> nr_feature_tokens = 6
> hidden_width = 64
> maxout_pieces = 2
>
> [model.tok2vec]
> # ...
> ```
| Name | Type | Description |
| ------------------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nr_feature_tokens` | int | |
| `hidden_width` | int | |
| `maxout_pieces` | int | |
| `use_upper` | bool | |
| `nO` | int | |
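As a rough sketch of how this registered name is used (assuming the `registry` object in `spacy.util` and its `architectures` table; not an excerpt from the library), the string in the config block resolves to a model-building function:

```python
from spacy.util import registry

# Look up the same string that appears in the @architectures line above.
build_parser_model = registry.architectures.get("spacy.TransitionBasedParser.v1")
# The remaining [model] settings (nr_feature_tokens, hidden_width, ...) would
# be passed to this function as keyword arguments when the config is resolved.
```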

View File

@ -297,60 +297,41 @@ will not be available.
## Train {#train} ## Train {#train}
<!-- TODO: document new training -->
Train a model. Expects data in spaCy's Train a model. Expects data in spaCy's
[JSON format](/api/data-formats#json-input). On each epoch, a model will be [binary format](/api/data-formats#training) and a
saved out to the directory. Accuracy scores and model details will be added to a [config file](/api/data-formats#config) with all settings and hyperparameters.
[`meta.json`](/usage/training#models-generating) to allow packaging the model Will save out the best model from all epochs, as well as the final model. The
using the [`package`](/api/cli#package) command. `--code` argument can be used to provide a Python file that's imported before
the training process starts. This lets you register
[custom functions](/usage/training#custom-models) and architectures and refer to
them in your config, all while still using spaCy's built-in `train` workflow. If
you need to manage complex multi-step training workflows, check out the new
[spaCy projects](/usage/projects).
<Infobox title="New in v3.0" variant="warning">
As of spaCy v3.0, the `train` command doesn't take a long list of command-line
arguments anymore and instead expects a single
[`config.cfg` file](/usage/training#config) containing all settings for the
pipeline, training process and hyperparameters.
</Infobox>
```bash ```bash
$ python -m spacy train [lang] [output_path] [train_path] [dev_path] $ python -m spacy train [train_path] [dev_path] [config_path] [--output]
[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping] [--code] [--verbose]
[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec]
[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level]
[--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel]
[--textcat-positive-label] [--verbose]
``` ```
| Argument | Type | Description | | Argument | Type | Description |
| --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | positional | Model language. | | `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. | | `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | | `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | | `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. | | `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | | `--verbose`, `-V` | flag | Show more detailed messages during training. |
| `--replace-components`, `-R` | flag | Replace components from the base model. | | `--help`, `-h` | flag | Show help message and available arguments. |
| `--vectors`, `-v` | option | Model to load vectors from. | | **CREATES** | model | The final model and the best model. |
| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). |
| `--use-gpu`, `-g` | option | GPU ID or `-1` for CPU only (default: `-1`). |
| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. |
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. |
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
| `--width`, `-cw` <Tag variant="new">2.2.4</Tag> | option | Width of CNN layers of `Tok2Vec` component. |
| `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag> | option | Depth of CNN layers of `Tok2Vec` component. |
| `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag> | option | Window size for CNN layers of `Tok2Vec` component. |
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag> | option | Maxout size for CNN layers of `Tok2Vec` component. |
| `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag> | flag | Whether to use character-based embedding of `Tok2Vec` component. |
| `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag> | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). |
| `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag> | option | Number of embedding rows of `Tok2Vec` component. |
| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
| `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag> | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. |
| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). |
| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. |
| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. |
| `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option | Location of JSON-formatted tag map. |
| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | model, pickle | A spaCy model on each epoch. |
## Pretrain {#pretrain new="2.1" tag="experimental"} ## Pretrain {#pretrain new="2.1" tag="experimental"}
@ -471,20 +452,20 @@ as separate files if the respective component is present in the model's
pipeline. pipeline.
```bash ```bash
$ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] $ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path]
[--gpu-id] [--gold-preproc] [--return-scores] [--displacy-limit] [--gpu-id] [--gold-preproc]
``` ```
| Argument | Type | Description | | Argument | Type | Description |
| ------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. | | `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
| `data_path` | positional | Location of JSON-formatted evaluation data. | | `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). |
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. | | `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. |
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. | | `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. | | `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | | `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
| `--return-scores`, `-R` | flag | Return dict containing model scores. | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| **CREATES** | `stdout`, HTML | Training results and optional displaCy visualizations. | | **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. |
## Package {#package} ## Package {#package}
@ -504,15 +485,17 @@ so you don't have to run `python setup.py sdist` separately anymore.
</Infobox> </Infobox>
```bash ```bash
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
[--version] [--force]
``` ```
```bash > #### Example
### Example >
python -m spacy package /input /output > ```bash
cd /output/en_model-0.0.0 > python -m spacy package /input /output
pip install dist/en_model-0.0.0.tar.gz > cd /output/en_model-0.0.0
``` > pip install dist/en_model-0.0.0.tar.gz
> ```
| Argument | Type | Description | | Argument | Type | Description |
| ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -525,18 +508,137 @@ pip install dist/en_model-0.0.0.tar.gz
| `--help`, `-h` | flag | Show help message and available arguments. | | `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | A Python package containing the spaCy model. | | **CREATES** | directory | A Python package containing the spaCy model. |
## Project {#project} ## Project {#project new="3"}
<!-- TODO: document project command and subcommands. We should probably wait and only finalize this once we've finalized the design --> The `spacy project` CLI includes subcommands for working with
[spaCy projects](/usage/projects), end-to-end workflows for building and
deploying custom spaCy models.
### project clone {#project-clone} ### project clone {#project-clone}
Clone a project template from a Git repository. Calls into `git` under the hood
and uses the sparse checkout feature, so you're only downloading what you need.
By default, spaCy's
[project templates repo](https://github.com/explosion/projects) is used, but you
can provide any other repo (public or private) that you have access to using the
`--repo` option.
<!-- TODO: update example once we've decided on repo structure -->
```bash
$ python -m spacy project clone [name] [dest] [--repo]
```
> #### Example
>
> ```bash
> $ python -m spacy project clone some_example
> ```
>
> Clone from custom repo:
>
> ```bash
> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
> ```
| Argument | Type | Description |
| -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- |
| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. |
| `dest` | positional | Where to clone the project. Defaults to current working directory. |
| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). |
### project assets {#project-assets} ### project assets {#project-assets}
### project run-all {#project-run-all} Fetch project assets like datasets and pretrained weights. Assets are defined
in the `assets` section of the [`project.yml`](/usage/projects#project-yml). If
a `checksum` is provided, the file is only downloaded if no local file with the
same checksum exists, and spaCy will show an error if the checksum of the
downloaded file doesn't match. If an asset doesn't specify a `url`, it's
considered "private" and you have to take care of putting it into the
destination directory yourself. If a local path is provided, the asset is
copied into the current project.
```bash
$ python -m spacy project assets [project_dir]
```
> #### Example
>
> ```bash
> $ python -m spacy project assets
> ```
| Argument | Type | Description |
| -------------- | ---------- | ----------------------------------------------------------------- |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. |
### project run {#project-run} ### project run {#project-run}
### project init {#project-init} Run a named command or workflow defined in the
[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
all commands in the workflow are run, in order. If commands define
[dependencies or outputs](/usage/projects#deps-outputs), they will only be
re-run if state has changed. For example, if the input dataset changes, a
preprocessing command that depends on those files will be re-run.
### project update-dvc {#project-update-dvc} ```bash
$ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
```
> #### Example
>
> ```bash
> $ python -m spacy project run train
> ```
| Argument | Type | Description |
| --------------- | ---------- | ----------------------------------------------------------------- |
| `subcommand` | positional | Name of the command or workflow to run. |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
| `--help`, `-h` | flag | Show help message and available arguments. |
### project dvc {#project-dvc}
Auto-generate a [Data Version Control](https://dvc.org) (DVC) config file. Calls
[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline,
so you need to specify one workflow defined in the
[`project.yml`](/usage/projects#project-yml). If no workflow is specified, the
first defined workflow is used. The DVC config will only be updated if the
`project.yml` changed. For details, see the
[DVC integration](/usage/projects#dvc) docs.
<Infobox variant="warning">
This command requires DVC to be installed and initialized in the project
directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init).
You'll also need to add the assets you want to track with
[`dvc add`](https://dvc.org/doc/command-reference/add).
</Infobox>
```bash
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
```
> #### Example
>
> ```bash
> git init
> dvc init
> python -m spacy project dvc all
> ```
| Argument | Type | Description |
| ----------------- | ---------- | --------------------------------------------------------------------------------- |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
| `--force`, `-F` | flag | Force-updating config file. |
| `--verbose`, `-V` | flag | Print more output generated by DVC. |
| `--help`, `-h` | flag | Show help message and available arguments. |

View File

@ -122,7 +122,7 @@ where the rescuers keep passing out from low oxygen, causing another rescuer to
follow — only to succumb themselves. In short, just say no to optimizing your follow — only to succumb themselves. In short, just say no to optimizing your
Python. If it's not fast enough the first time, just switch to Cython. Python. If it's not fast enough the first time, just switch to Cython.
<Infobox title="📖 Resources"> <Infobox title="Resources" emoji="📖">
- [Official Cython documentation](http://docs.cython.org/en/latest/) - [Official Cython documentation](http://docs.cython.org/en/latest/)
(cython.org) (cython.org)

View File

@ -2,7 +2,8 @@
title: Data formats title: Data formats
teaser: Details on spaCy's input and output data formats teaser: Details on spaCy's input and output data formats
menu: menu:
- ['Training data', 'training'] - ['Training Data', 'training']
- ['Training Config', 'config']
- ['Vocabulary', 'vocab'] - ['Vocabulary', 'vocab']
--- ---
@ -74,6 +75,29 @@ from the English Wall Street Journal portion of the Penn Treebank:
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
``` ```
## Training config {#config new="3"}
Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
[usage documentation](/usage/training#config).
<Infobox variant="warning">
The `@` syntax lets you refer to function names registered in the
[function registry](/api/top-level#registry). For example,
`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block
will be passed into that function as arguments. Those arguments depend on the
registered function. See the [model architectures](/api/architectures) docs for
API details.
</Infobox>
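As a minimal sketch of what resolution looks like from Python (reusing the `util.load_config` helper shown in the pipeline component examples on this site; `"model.cfg"` is a placeholder path):

```python
from spacy import util

# Parse the config and let spaCy build real objects from every @-reference.
cfg = util.load_config("model.cfg", create_objects=True)
model = cfg["model"]  # built by the registered @architectures function
```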
<!-- TODO: we need to come up with a good way to present the sections and their expected values visually? -->
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
## Lexical data for vocabulary {#vocab-jsonl new="2"} ## Lexical data for vocabulary {#vocab-jsonl new="2"}
To populate a model's vocabulary, you can use the To populate a model's vocabulary, you can use the

View File

@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines) component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"parser"`. via the ID `"parser"`.
## DependencyParser.Model {#model tag="classmethod"} ## Default config {#config}
Initialize a model for the pipe. The model should implement the This is the default configuration used to initialize the model powering the
`thinc.neural.Model` API. Wrappers are under development for most major machine pipeline component. See the [model architectures](/api/architectures)
learning libraries. documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.
| Name | Type | Description | ```python
| ----------- | ------ | ------------------------------------- | https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_defaults.cfg
| `**kwargs` | - | Parameters for initializing the model | ```
| **RETURNS** | object | The initialized model. |
## DependencyParser.\_\_init\_\_ {#init tag="method"} ## DependencyParser.\_\_init\_\_ {#init tag="method"}
> #### Example
>
> ```python
> # Construction via create_pipe with default model
> parser = nlp.create_pipe("parser")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_parser"}}
> parser = nlp.create_pipe("parser", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import DependencyParser
> model = util.load_config("model.cfg", create_objects=True)["model"]
> parser = DependencyParser(nlp.vocab, model)
> ```
Create a new pipeline instance. In your application, you would normally use a Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe). [`nlp.create_pipe`](/api/language#create_pipe).
> #### Example | Name | Type | Description |
> | ----------- | ------------------ | ------------------------------------------------------------------------------- |
> ```python | `vocab` | `Vocab` | The shared vocabulary. |
> # Construction via create_pipe | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
> parser = nlp.create_pipe("parser") | `**cfg` | - | Configuration parameters. |
> | **RETURNS** | `DependencyParser` | The newly constructed object. |
> # Construction from class
> from spacy.pipeline import DependencyParser
> parser = DependencyParser(nlp.vocab)
> parser.from_disk("/path/to/model")
> ```
| Name | Type | Description |
| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `**cfg` | - | Configuration parameters. |
| **RETURNS** | `DependencyParser` | The newly constructed object. |
## DependencyParser.\_\_call\_\_ {#call tag="method"} ## DependencyParser.\_\_call\_\_ {#call tag="method"}
@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | -------- | ------------------------------------------------------ | | ------------ | --------------- | ------------------------------------------------------ |
| `stream` | iterable | A stream of documents. | | `stream` | `Iterable[Doc]` | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | `Doc` | Processed documents in the order of the original text. |
## DependencyParser.predict {#predict tag="method"} ## DependencyParser.predict {#predict tag="method"}
@ -104,7 +109,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------- | ---------------------------------------------- | | ----------- | ------------------- | ---------------------------------------------- |
| `docs` | iterable | The documents to predict. | | `docs` | `Iterable[Doc]` | The documents to predict. |
| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). | | **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). |
## DependencyParser.set_annotations {#set_annotations tag="method"} ## DependencyParser.set_annotations {#set_annotations tag="method"}
@ -119,33 +124,34 @@ Modify a batch of documents, using pre-computed scores.
> parser.set_annotations([doc1, doc2], scores) > parser.set_annotations([doc1, doc2], scores)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------- | -------- | ---------------------------------------------------------- | | -------- | ------------------- | ---------------------------------------------------------- |
| `docs` | iterable | The documents to modify. | | `docs` | `Iterable[Doc]` | The documents to modify. |
| `scores` | - | The scores to set, produced by `DependencyParser.predict`. | | `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. |
## DependencyParser.update {#update tag="method"} ## DependencyParser.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
pipe's model. Delegates to [`predict`](/api/dependencyparser#predict) and model. Delegates to [`predict`](/api/dependencyparser#predict) and
[`get_loss`](/api/dependencyparser#get_loss). [`get_loss`](/api/dependencyparser#get_loss).
> #### Example > #### Example
> >
> ```python > ```python
> parser = DependencyParser(nlp.vocab) > parser = DependencyParser(nlp.vocab, parser_model)
> losses = {}
> optimizer = nlp.begin_training() > optimizer = nlp.begin_training()
> parser.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) > losses = parser.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------- | -------- | -------------------------------------------------------------------------------------------- | | ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A batch of documents to learn from. | | `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | | _keyword-only_ | | |
| `drop` | float | The dropout rate. | | `drop` | float | The dropout rate. |
| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. | | `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). |
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | | `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
## DependencyParser.get_loss {#get_loss tag="method"} ## DependencyParser.get_loss {#get_loss tag="method"}
@ -156,21 +162,20 @@ predicted scores.
> >
> ```python > ```python
> parser = DependencyParser(nlp.vocab) > parser = DependencyParser(nlp.vocab)
> scores = parser.predict([doc1, doc2]) > scores = parser.predict([eg.predicted for eg in examples])
> loss, d_loss = parser.get_loss([doc1, doc2], [gold1, gold2], scores) > loss, d_loss = parser.get_loss(examples, scores)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------ | | ----------- | ------------------- | --------------------------------------------------- |
| `docs` | iterable | The batch of documents. | | `examples` | `Iterable[Example]` | The batch of examples. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | | `scores` | `syntax.StateClass` | Scores representing the model's predictions. |
| `scores` | - | Scores representing the model's predictions. | | **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
## DependencyParser.begin_training {#begin_training tag="method"} ## DependencyParser.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. If no model Initialize the pipe for training, using data examples if available. Return an
has been initialized yet, the model is added. [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example > #### Example
> >
@ -180,16 +185,17 @@ has been initialized yet, the model is added.
> optimizer = parser.begin_training(pipeline=nlp.pipeline) > optimizer = parser.begin_training(pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | | `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
| `pipeline` | list | Optional list of pipeline components that this component is part of. | | `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`DependencyParser`](/api/dependencyparser#create_optimizer) if not set. | | `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. |
| **RETURNS** | callable | An optimizer. | | **RETURNS** | `Optimizer` | An optimizer. |
## DependencyParser.create_optimizer {#create_optimizer tag="method"} ## DependencyParser.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component. Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
component.
> #### Example > #### Example
> >
@ -198,9 +204,9 @@ Create an optimizer for the pipeline component.
> optimizer = parser.create_optimizer() > optimizer = parser.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------- | -------------- | | ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | callable | The optimizer. | | **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
## DependencyParser.use_params {#use_params tag="method, contextmanager"} ## DependencyParser.use_params {#use_params tag="method, contextmanager"}

View File

@ -12,44 +12,47 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines) component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"entity_linker"`. via the ID `"entity_linker"`.
## EntityLinker.Model {#model tag="classmethod"} ## Default config {#config}
Initialize a model for the pipe. The model should implement the This is the default configuration used to initialize the model powering the
`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the pipeline component. See the [model architectures](/api/architectures)
context encoder. Wrappers are under development for most major machine learning documentation for details on the architectures and their arguments and
libraries. hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.
| Name | Type | Description | ```python
| ----------- | ------ | ------------------------------------- | https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_linker_defaults.cfg
| `**kwargs` | - | Parameters for initializing the model | ```
| **RETURNS** | object | The initialized model. |
## EntityLinker.\_\_init\_\_ {#init tag="method"} ## EntityLinker.\_\_init\_\_ {#init tag="method"}
> #### Example
>
> ```python
> # Construction via create_pipe with default model
> entity_linker = nlp.create_pipe("entity_linker")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_el"}}
> entity_linker = nlp.create_pipe("entity_linker", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import EntityLinker
> model = util.load_config("model.cfg", create_objects=True)["model"]
> entity_linker = EntityLinker(nlp.vocab, model)
> ```
Create a new pipeline instance. In your application, you would normally use a Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe). [`nlp.create_pipe`](/api/language#create_pipe).
> #### Example | Name | Type | Description |
> | ------- | ------- | ------------------------------------------------------------------------------- |
> ```python | `vocab` | `Vocab` | The shared vocabulary. |
> # Construction via create_pipe | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
> entity_linker = nlp.create_pipe("entity_linker") | `**cfg` | - | Configuration parameters. |
>
> # Construction from class
> from spacy.pipeline import EntityLinker
> entity_linker = EntityLinker(nlp.vocab)
> entity_linker.from_disk("/path/to/model")
> ```
| Name | Type | Description | | **RETURNS** | `EntityLinker` | The newly constructed object. |
| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to `128`. |
| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to `True`. |
| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilities are used). Defaults to `True`. |
| **RETURNS** | `EntityLinker` | The newly constructed object. |
## EntityLinker.\_\_call\_\_ {#call tag="method"} ## EntityLinker.\_\_call\_\_ {#call tag="method"}
@ -91,11 +94,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | -------- | ------------------------------------------------------ | | ------------ | --------------- | ------------------------------------------------------ |
| `stream` | iterable | A stream of documents. | | `stream` | `Iterable[Doc]` | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | `Doc` | Processed documents in the order of the original text. |
## EntityLinker.predict {#predict tag="method"} ## EntityLinker.predict {#predict tag="method"}
@ -105,13 +108,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
> >
> ```python > ```python
> entity_linker = EntityLinker(nlp.vocab) > entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict([doc1, doc2]) > kb_ids = entity_linker.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | --------------- | ------------------------------------------------------------ |
| `docs` | iterable | The documents to predict. | | `docs` | `Iterable[Doc]` | The documents to predict. |
| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. | | **RETURNS** | `Iterable[str]` | The predicted KB identifiers for the entities in the `docs`. |
## EntityLinker.set_annotations {#set_annotations tag="method"} ## EntityLinker.set_annotations {#set_annotations tag="method"}
@ -122,19 +125,18 @@ entities.
> >
> ```python > ```python
> entity_linker = EntityLinker(nlp.vocab) > entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict([doc1, doc2]) > kb_ids = entity_linker.predict([doc1, doc2])
> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors) > entity_linker.set_annotations([doc1, doc2], kb_ids)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | -------- | ------------------------------------------------------------------------------------------------- | | -------- | --------------- | ------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to modify. | | `docs` | `Iterable[Doc]` | The documents to modify. |
| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | | `kb_ids` | `Iterable[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
| `tensors` | iterable | The token representations used to predict the identifiers. |
## EntityLinker.update {#update tag="method"} ## EntityLinker.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating both the Learn from a batch of [`Example`](/api/example) objects, updating both the
pipe's entity linking model and context encoder. Delegates to pipe's entity linking model and context encoder. Delegates to
[`predict`](/api/entitylinker#predict) and [`predict`](/api/entitylinker#predict) and
[`get_loss`](/api/entitylinker#get_loss). [`get_loss`](/api/entitylinker#get_loss).
@ -142,40 +144,20 @@ pipe's entity linking model and context encoder. Delegates to
> #### Example > #### Example
> >
> ```python > ```python
> entity_linker = EntityLinker(nlp.vocab) > entity_linker = EntityLinker(nlp.vocab, nel_model)
> losses = {}
> optimizer = nlp.begin_training() > optimizer = nlp.begin_training()
> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) > losses = entity_linker.update(examples, sgd=optimizer)
> ``` > ```

| Name              | Type                | Description                                                                                                                                 |
| ----------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                 |
| _keyword-only_    |                     |                                                                                                                                             |
| `drop`            | float               | The dropout rate.                                                                                                                           |
| `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations). |
| `sgd`             | `Optimizer`         | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                              |
| `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                |
| **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                            |
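
In a training loop, `update` is typically called once per batch or epoch, with
the `losses` dictionary carried along between calls. A sketch, assuming
`train_examples` is a list of `Example` objects and the loss is keyed by the
component name `"entity_linker"`:

```python
import random

optimizer = nlp.begin_training()
losses = {}
for epoch in range(10):  # epoch count is arbitrary for this sketch
    random.shuffle(train_examples)
    losses = entity_linker.update(train_examples, drop=0.2, sgd=optimizer, losses=losses)
    print(epoch, losses["entity_linker"])
```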

## EntityLinker.get_loss {#get_loss tag="method"}

Find the loss and gradient of loss for the entities in a batch of documents and
their predicted scores.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids = entity_linker.predict([eg.predicted for eg in examples])
> loss, d_loss = entity_linker.get_loss(examples, kb_ids)
> ```

| Name        | Type                | Description                                           |
| ----------- | ------------------- | ----------------------------------------------------- |
| `examples`  | `Iterable[Example]` | The batch of examples.                                |
| `kb_ids`    | `Iterable[str]`     | KB identifiers representing the model's predictions.  |
| **RETURNS** | tuple               | The loss and the gradient, i.e. `(loss, gradient)`.   |

## EntityLinker.set_kb {#set_kb tag="method"}

## EntityLinker.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this
method, a knowledge base should have been defined with
[`set_kb`](/api/entitylinker#set_kb).

> #### Example
>
> ```python
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
> ```

| Name           | Type                    | Description                                                                                                                                                       |
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                             |
| `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                             |
| `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. |
| **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                    |
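
Putting the pieces together, a typical setup defines the knowledge base before
initializing training. A sketch, assuming `kb` is an already populated
`KnowledgeBase`:

```python
entity_linker = nlp.create_pipe("entity_linker")
entity_linker.set_kb(kb)  # kb: assumed to be a populated KnowledgeBase
nlp.add_pipe(entity_linker)
optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
```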

## EntityLinker.create_optimizer {#create_optimizer tag="method"}

Create an optimizer for the pipeline component.

> #### Example
>
> ```python
> optimizer = entity_linker.create_optimizer()
> ```

| Name        | Type        | Description                                                     |
| ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |

## EntityLinker.use_params {#use_params tag="method, contextmanager"}


This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"ner"`.

## Default config {#config}

This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.

```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defaults.cfg
```
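
To build the model object described by such a config file, the same pattern as
in the constructor example below can be used — the filename here is
illustrative:

```python
from spacy import util

# create the model object described by a local copy of the default config
model = util.load_config("ner_defaults.cfg", create_objects=True)["model"]
```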

## EntityRecognizer.\_\_init\_\_ {#init tag="method"}

> #### Example
>
> ```python
> # Construction via create_pipe
> ner = nlp.create_pipe("ner")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_ner"}}
> ner = nlp.create_pipe("ner", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import EntityRecognizer
> model = util.load_config("model.cfg", create_objects=True)["model"]
> ner = EntityRecognizer(nlp.vocab, model)
> ```

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).

| Name        | Type               | Description                                                                      |
| ----------- | ------------------ | --------------------------------------------------------------------------------- |
| `vocab`     | `Vocab`            | The shared vocabulary.                                                           |
| `model`     | `Model`            | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.  |
| `**cfg`     | -                  | Configuration parameters.                                                        |
| **RETURNS** | `EntityRecognizer` | The newly constructed object.                                                    |
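
After construction, the component still has to be added to the pipeline before
it runs on any text. A brief sketch:

```python
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
```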

## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
> pass
> ```

| Name         | Type            | Description                                             |
| ------------ | --------------- | ------------------------------------------------------- |
| `stream`     | `Iterable[Doc]` | A stream of documents.                                  |
| `batch_size` | int             | The number of texts to buffer. Defaults to `128`.       |
| **YIELDS**   | `Doc`           | Processed documents in the order of the original text.  |

## EntityRecognizer.predict {#predict tag="method"}

Apply the pipeline's model to a batch of docs, without modifying them.

> #### Example
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([doc1, doc2])
> ```

| Name        | Type               | Description                                                                                                 |
| ----------- | ------------------ | ------------------------------------------------------------------------------------------------------------ |
| `docs`      | `Iterable[Doc]`    | The documents to predict.                                                                                   |
| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal).  |

## EntityRecognizer.set_annotations {#set_annotations tag="method"}

Modify a batch of documents, using pre-computed scores.

> #### Example
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([doc1, doc2])
> ner.set_annotations([doc1, doc2], scores)
> ```

| Name     | Type               | Description                                                 |
| -------- | ------------------ | ------------------------------------------------------------ |
| `docs`   | `Iterable[Doc]`    | The documents to modify.                                    |
| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`.  |
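
After `set_annotations`, the predicted entities are available on the docs as
usual. A short sketch, continuing the example above:

```python
scores = ner.predict([doc1, doc2])
ner.set_annotations([doc1, doc2], scores)
for ent in doc1.ents:
    # entities are now written to the Doc objects
    print(ent.text, ent.label_)
```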

## EntityRecognizer.update {#update tag="method"}

Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/entityrecognizer#predict) and
[`get_loss`](/api/entityrecognizer#get_loss).

> #### Example
>
> ```python
> ner = EntityRecognizer(nlp.vocab, ner_model)
> optimizer = nlp.begin_training()
> losses = ner.update(examples, sgd=optimizer)
> ```

| Name              | Type                | Description                                                                                                                                     |
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                     |
| _keyword-only_    |                     |                                                                                                                                                 |
| `drop`            | float               | The dropout rate.                                                                                                                               |
| `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). |
| `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                                 |
| `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                   |
| **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                               |
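
A training loop usually shuffles and batches the examples between updates. A
sketch, assuming `train_examples` is a list of `Example` objects and
`optimizer` comes from `nlp.begin_training()`:

```python
import random
from spacy.util import minibatch

losses = {}
for epoch in range(10):  # epoch count is arbitrary for this sketch
    random.shuffle(train_examples)
    for batch in minibatch(train_examples, size=8):
        ner.update(batch, drop=0.2, sgd=optimizer, losses=losses)
    print(epoch, losses["ner"])
```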

## EntityRecognizer.get_loss {#get_loss tag="method"}

Find the loss and gradient of loss for the batch of documents and their
predicted scores.

> #### Example
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([eg.predicted for eg in examples])
> loss, d_loss = ner.get_loss(examples, scores)
> ```

| Name        | Type                | Description                                          |
| ----------- | ------------------- | ----------------------------------------------------- |
| `examples`  | `Iterable[Example]` | The batch of examples.                               |
| `scores`    | `List[StateClass]`  | Scores representing the model's predictions.         |
| **RETURNS** | tuple               | The loss and the gradient, i.e. `(loss, gradient)`.  |

## EntityRecognizer.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example
>
> ```python
> optimizer = ner.begin_training(pipeline=nlp.pipeline)
> ```

| Name           | Type                    | Description                                                                                                                                                           |
| -------------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                                 |
| `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                                 |
| `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. |
| **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                        |
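
Entity labels are usually added before initialization, so the model's output
layer is sized correctly. A sketch with illustrative label names:

```python
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
for label in ("PERSON", "ORG"):  # labels are illustrative
    ner.add_label(label)
optimizer = ner.begin_training(pipeline=nlp.pipeline)
```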

## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}

Create an optimizer for the pipeline component.

> #### Example
>
> ```python
> optimizer = ner.create_optimizer()
> ```

| Name        | Type        | Description                                                     |
| ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |

## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
