Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthw Honnibal 2020-07-08 15:27:50 +02:00
commit fb8a5967c1
11 changed files with 330 additions and 191 deletions

View File

@ -4,5 +4,4 @@ __version__ = "3.0.0a2"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
__projects__ = "https://github.com/explosion/spacy-boilerplates" __projects__ = "https://github.com/explosion/spacy-boilerplates"

View File

@ -1,4 +1,4 @@
from typing import Optional, Sequence, Union from typing import Optional, Sequence
import requests import requests
import sys import sys
from wasabi import msg from wasabi import msg
@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
from .. import about from .. import about
from ..util import is_package, get_base_version, run_command from ..util import is_package, get_base_version, run_command
# These are the old shortcuts we previously supported in spacy download. As of
# v3, shortcuts are deprecated so we're not expecting to add anything to this
# list. It only exists to show users warnings.
# Keys are the legacy shortcut names; values are the full model package names
# that the download command substitutes after printing the deprecation warning.
OLD_SHORTCUTS = {
"en": "en_core_web_sm",
"de": "de_core_news_sm",
"es": "es_core_news_sm",
"pt": "pt_core_news_sm",
"fr": "fr_core_news_sm",
"it": "it_core_news_sm",
"nl": "nl_core_news_sm",
"el": "el_core_news_sm",
"nb": "nb_core_news_sm",
"lt": "lt_core_news_sm",
"xx": "xx_ent_wiki_sm",
}
@app.command( @app.command(
"download", "download",
@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
version = components[-1] version = components[-1]
download_model(dl_tpl.format(m=model_name, v=version), pip_args) download_model(dl_tpl.format(m=model_name, v=version), pip_args)
else: else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts") model_name = model
model_name = shortcuts.get(model, model) if model in OLD_SHORTCUTS:
msg.warn(
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
)
model_name = OLD_SHORTCUTS[model]
compatibility = get_compatibility() compatibility = get_compatibility()
version = get_version(model_name, compatibility) version = get_version(model_name, compatibility)
download_model(dl_tpl.format(m=model_name, v=version), pip_args) download_model(dl_tpl.format(m=model_name, v=version), pip_args)
@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
) )
def get_json(url: str, desc: str) -> Union[dict, list]: def get_compatibility() -> dict:
r = requests.get(url) version = get_base_version(about.__version__)
r = requests.get(about.__compatibility__)
if r.status_code != 200: if r.status_code != 200:
msg.fail( msg.fail(
f"Server error ({r.status_code})", f"Server error ({r.status_code})",
f"Couldn't fetch {desc}. Please find a model for your spaCy " f"Couldn't fetch compatibility table. Please find a model for your spaCy "
f"installation (v{about.__version__}), and download it manually. " f"installation (v{about.__version__}), and download it manually. "
f"For more details, see the documentation: " f"For more details, see the documentation: "
f"https://spacy.io/usage/models", f"https://spacy.io/usage/models",
exits=1, exits=1,
) )
return r.json() comp_table = r.json()
def get_compatibility() -> dict:
version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"] comp = comp_table["spacy"]
if version not in comp: if version not in comp:
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1) msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)

View File

@ -1,4 +1,4 @@
from typing import List, Dict, Any, Optional, Sequence from typing import List, Dict, Any, Optional, Sequence, Union
import typer import typer
import srsly import srsly
from pathlib import Path from pathlib import Path
@ -18,7 +18,7 @@ from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum, split_command from ..util import get_hash, get_checksum, split_command
CONFIG_FILE = "project.yml" PROJECT_FILE = "project.yml"
DVC_CONFIG = "dvc.yaml" DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc" DVC_DIR = ".dvc"
DIRS = [ DIRS = [
@ -38,12 +38,12 @@ CACHES = [
os.environ.get("TORCH_HOME"), os.environ.get("TORCH_HOME"),
Path.home() / ".keras", Path.home() / ".keras",
] ]
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. Do not edit
# it directly and edit the project.yml instead and re-run the project.""" # it directly and edit the {PROJECT_FILE} instead and re-run the project."""
CLI_HELP = f"""Command-line interface for spaCy projects and working with project CLI_HELP = f"""Command-line interface for spaCy projects and working with project
templates. You'd typically start by cloning a project template to a local templates. You'd typically start by cloning a project template to a local
directory and fetching its assets like datasets etc. See the project's directory and fetching its assets like datasets etc. See the project's
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data {PROJECT_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
Version Control) to manage input and output files and to ensure steps are only Version Control) to manage input and output files and to ensure steps are only
re-run if their inputs change. re-run if their inputs change.
""" """
@ -91,7 +91,7 @@ def project_init_cli(
# fmt: off # fmt: off
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
force: bool = Opt(False, "--force", "-F", help="Force initiziation"), force: bool = Opt(False, "--force", "-F", "-f", help="Force initiziation"),
# fmt: on # fmt: on
): ):
"""Initialize a project directory with DVC and optionally Git. This should """Initialize a project directory with DVC and optionally Git. This should
@ -100,7 +100,7 @@ def project_init_cli(
be a Git repo, it should be initialized with Git first, before initializing be a Git repo, it should be initialized with Git first, before initializing
DVC. This allows DVC to integrate with Git. DVC. This allows DVC to integrate with Git.
""" """
project_init(path, git=git, force=force, silent=True) project_init(path, git=git, force=force)
@project_cli.command("assets") @project_cli.command("assets")
@ -110,11 +110,11 @@ def project_assets_cli(
# fmt: on # fmt: on
): ):
"""Use DVC (Data Version Control) to fetch project assets. Assets are """Use DVC (Data Version Control) to fetch project assets. Assets are
defined in the "assets" section of the project config. If possible, DVC defined in the "assets" section of the project.yml. If possible, DVC
will try to track the files so you can pull changes from upstream. It will will try to track the files so you can pull changes from upstream. It will
also try and store the checksum so the assets are versioned. If the file also try and store the checksum so the assets are versioned. If the file
can't be tracked or checked, it will be downloaded without DVC. If a checksum can't be tracked or checked, it will be downloaded without DVC. If a checksum
is provided in the project config, the file is only downloaded if no local is provided in the project.yml, the file is only downloaded if no local
file with the same checksum exists. file with the same checksum exists.
""" """
project_assets(project_dir) project_assets(project_dir)
@ -132,7 +132,7 @@ def project_run_all_cli(
# fmt: on # fmt: on
): ):
"""Run all commands defined in the project. This command will use DVC and """Run all commands defined in the project. This command will use DVC and
the defined outputs and dependencies in the project config to determine the defined outputs and dependencies in the project.yml to determine
which steps need to be re-run and where to start. This means you're only which steps need to be re-run and where to start. This means you're only
re-generating data if the inputs have changed. re-generating data if the inputs have changed.
@ -151,12 +151,12 @@ def project_run_all_cli(
def project_run_cli( def project_run_cli(
# fmt: off # fmt: off
ctx: typer.Context, ctx: typer.Context,
subcommand: str = Arg(None, help="Name of command defined in project config"), subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on # fmt: on
): ):
"""Run a named script defined in the project config. If the command is """Run a named script defined in the project.yml. If the command is
part of the default pipeline defined in the "run" section, DVC is used to part of the default pipeline defined in the "run" section, DVC is used to
determine whether the step should re-run if its inputs have changed, or determine whether the step should re-run if its inputs have changed, or
whether everything is up to date. If the script is not part of the default whether everything is up to date. If the script is not part of the default
@ -175,13 +175,13 @@ def project_run_cli(
@project_cli.command("exec", hidden=True) @project_cli.command("exec", hidden=True)
def project_exec_cli( def project_exec_cli(
# fmt: off # fmt: off
subcommand: str = Arg(..., help="Name of command defined in project config"), subcommand: str = Arg(..., help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on # fmt: on
): ):
"""Execute a command defined in the project config. This CLI command is """Execute a command defined in the project.yml. This CLI command is
only called internally in auto-generated DVC pipelines, as a shortcut for only called internally in auto-generated DVC pipelines, as a shortcut for
multi-step commands in the project config. You typically shouldn't have to multi-step commands in the project.yml. You typically shouldn't have to
call it yourself. To run a command, call "run" or "run-all". call it yourself. To run a command, call "run" or "run-all".
""" """
project_exec(project_dir, subcommand) project_exec(project_dir, subcommand)
@ -196,15 +196,15 @@ def project_update_dvc_cli(
# fmt: on # fmt: on
): ):
"""Update the auto-generated DVC config file. Uses the steps defined in the """Update the auto-generated DVC config file. Uses the steps defined in the
"run" section of the project config. This typically happens automatically "run" section of the project.yml. This typically happens automatically
when running a command, but can also be triggered manually if needed. when running a command, but can also be triggered manually if needed.
""" """
config = load_project_config(project_dir) config = load_project_config(project_dir)
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
if updated: if updated:
msg.good(f"Updated DVC config from {CONFIG_FILE}") msg.good(f"Updated DVC config from {PROJECT_FILE}")
else: else:
msg.info(f"No changes found in {CONFIG_FILE}, no update needed") msg.info(f"No changes found in {PROJECT_FILE}, no update needed")
app.add_typer(project_cli, name="project") app.add_typer(project_cli, name="project")
@ -241,7 +241,7 @@ def project_clone(
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try: try:
run_command(cmd) run_command(cmd)
except SystemExit: except DVCError:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err) msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
@ -249,7 +249,7 @@ def project_clone(
try: try:
run_command(["git", "-C", str(tmp_dir), "fetch"]) run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"]) run_command(["git", "-C", str(tmp_dir), "checkout"])
except SystemExit: except DVCError:
err = f"Could not clone '{name}' in the repo '{repo}'." err = f"Could not clone '{name}' in the repo '{repo}'."
msg.fail(err) msg.fail(err)
shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
@ -282,27 +282,29 @@ def project_init(
with working_dir(project_dir) as cwd: with working_dir(project_dir) as cwd:
if git: if git:
run_command(["git", "init"]) run_command(["git", "init"])
init_cmd = ["dvc", "init"] flags = {"--force": force, "--quiet": silent, "--no-scm": not git}
if silent: try:
init_cmd.append("--quiet") run_dvc_command(["init"], flags=flags)
if not git: except DVCError:
init_cmd.append("--no-scm") msg.fail(
if force: "Failed to initialize project. This likely means that the "
init_cmd.append("--force") "project is already initialized and has a .dvc directory. "
run_command(init_cmd) "To force-initialize, use the --force flag.",
exits=1,
)
# We don't want to have analytics on by default our users should # We don't want to have analytics on by default our users should
# opt-in explicitly. If they want it, they can always enable it. # opt-in explicitly. If they want it, they can always enable it.
if not analytics: if not analytics:
run_command(["dvc", "config", "core.analytics", "false"]) run_dvc_command(["config", "core.analytics", "false"])
# Remove unused and confusing plot templates from .dvc directory # Remove unused and confusing plot templates from .dvc directory.
# TODO: maybe we shouldn't do this, but it's otherwise super confusing # Otherwise super confusing once you commit your changes via Git and it
# once you commit your changes via Git and it creates a bunch of files # creates a bunch of files that have no purpose.
# that have no purpose
plots_dir = cwd / DVC_DIR / "plots" plots_dir = cwd / DVC_DIR / "plots"
if plots_dir.exists(): if plots_dir.exists():
shutil.rmtree(str(plots_dir)) shutil.rmtree(str(plots_dir))
config = load_project_config(cwd) config = load_project_config(cwd)
setup_check_dvc(cwd, config) setup_check_dvc(cwd, config)
msg.good("Initialized project")
def project_assets(project_dir: Path) -> None: def project_assets(project_dir: Path) -> None:
@ -315,19 +317,33 @@ def project_assets(project_dir: Path) -> None:
setup_check_dvc(project_path, config) setup_check_dvc(project_path, config)
assets = config.get("assets", {}) assets = config.get("assets", {})
if not assets: if not assets:
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)") msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {}) variables = config.get("variables", {})
fetched_assets = [] fetched_assets = []
for asset in assets: for asset in assets:
url = asset["url"].format(**variables)
dest = asset["dest"].format(**variables) dest = asset["dest"].format(**variables)
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum")) url = asset.get("url")
checksum = asset.get("checksum")
if not url:
# project.yml defines asset without URL that the user has to place
if not Path(dest).exists():
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
msg.warn(err)
else:
if checksum == get_checksum(dest):
msg.good(f"Asset exists with matching checksum: {dest}")
fetched_assets.append((project_path / dest).resolve())
else:
msg.fail(f"Asset available but with incorrect checksum: {dest}")
continue
url = url.format(**variables)
fetched_path = fetch_asset(project_path, url, dest, checksum)
if fetched_path: if fetched_path:
fetched_assets.append(str(fetched_path)) fetched_assets.append(str(fetched_path))
if fetched_assets: if fetched_assets:
with working_dir(project_path): with working_dir(project_path):
run_command(["dvc", "add", *fetched_assets, "--external"]) run_dvc_command(["add", *fetched_assets, "--external"])
def fetch_asset( def fetch_asset(
@ -359,19 +375,17 @@ def fetch_asset(
# Try with tracking the source first, then just downloading with # Try with tracking the source first, then just downloading with
# DVC, then a regular non-DVC download. # DVC, then a regular non-DVC download.
try: try:
dvc_cmd = ["dvc", "import-url", url, str(dest_path)] run_dvc_command(["import-url", url, str(dest_path)])
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) except DVCError:
except subprocess.CalledProcessError: run_dvc_command(["get-url", url, str(dest_path)])
dvc_cmd = ["dvc", "get-url", url, str(dest_path)] except DVCError:
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
try: try:
download_file(url, dest_path) download_file(url, dest_path)
except requests.exceptions.HTTPError as e: except requests.exceptions.HTTPError as e:
msg.fail(f"Download failed: {dest}", e) msg.fail(f"Download failed: {dest}", e)
return None return None
if checksum and checksum != get_checksum(dest_path): if checksum and checksum != get_checksum(dest_path):
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
msg.good(f"Fetched asset {dest}") msg.good(f"Fetched asset {dest}")
return dest_path return dest_path
@ -384,13 +398,17 @@ def project_run_all(project_dir: Path, *dvc_args) -> None:
""" """
config = load_project_config(project_dir) config = load_project_config(project_dir)
setup_check_dvc(project_dir, config) setup_check_dvc(project_dir, config)
dvc_cmd = ["dvc", "repro", *dvc_args]
with working_dir(project_dir): with working_dir(project_dir):
run_command(dvc_cmd) try:
run_dvc_command(["repro", *dvc_args])
except DVCError:
# We could raise a custom error here, but the output produced by
# DVC is already pretty substantial.
sys.exit(1)
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
"""Simulate a CLI help prompt using the info available in the project config. """Simulate a CLI help prompt using the info available in the project.yml.
project_dir (Path): The project directory. project_dir (Path): The project directory.
subcommand (Optional[str]): The subcommand or None. If a subcommand is subcommand (Optional[str]): The subcommand or None. If a subcommand is
@ -408,15 +426,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
if help_text: if help_text:
msg.text(f"\n{help_text}\n") msg.text(f"\n{help_text}\n")
else: else:
print(f"\nAvailable commands in {CONFIG_FILE}") print(f"\nAvailable commands in {PROJECT_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}") print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
msg.text("Run all commands defined in the 'run' block of the project config:") msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:")
print(f"{COMMAND} project run-all {project_dir}") print(f"{COMMAND} project run-all {project_dir}")
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
"""Run a named script defined in the project config. If the script is part """Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to of the default pipeline (defined in the "run" section), DVC is used to
execute the command, so it can determine whether to rerun it. It then execute the command, so it can determine whether to rerun it. It then
calls into "exec" to execute it. calls into "exec" to execute it.
@ -433,9 +451,13 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
validate_subcommand(commands.keys(), subcommand) validate_subcommand(commands.keys(), subcommand)
if subcommand in config.get("run", []): if subcommand in config.get("run", []):
# This is one of the pipeline commands tracked in DVC # This is one of the pipeline commands tracked in DVC
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
with working_dir(project_dir): with working_dir(project_dir):
run_command(dvc_cmd) try:
run_dvc_command(["repro", subcommand, *dvc_args])
except DVCError:
# We could raise a custom error here, but the output produced by
# DVC is already pretty substantial.
sys.exit(1)
else: else:
cmd = commands[subcommand] cmd = commands[subcommand]
# Deps in non-DVC commands aren't tracked, but if they're defined, # Deps in non-DVC commands aren't tracked, but if they're defined,
@ -448,8 +470,8 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
run_commands(cmd["script"], variables) run_commands(cmd["script"], variables)
def project_exec(project_dir: Path, subcommand: str): def project_exec(project_dir: Path, subcommand: str) -> None:
"""Execute a command defined in the project config. """Execute a command defined in the project.yml.
project_dir (Path): Path to project directory. project_dir (Path): Path to project directory.
subcommand (str): Name of command to run. subcommand (str): Name of command to run.
@ -468,15 +490,15 @@ def project_exec(project_dir: Path, subcommand: str):
def load_project_config(path: Path) -> Dict[str, Any]: def load_project_config(path: Path) -> Dict[str, Any]:
"""Load the project config file from a directory and validate it. """Load the project.yml file from a directory and validate it.
path (Path): The path to the project directory. path (Path): The path to the project directory.
RETURNS (Dict[str, Any]): The loaded project config. RETURNS (Dict[str, Any]): The loaded project.yml.
""" """
config_path = path / CONFIG_FILE config_path = path / PROJECT_FILE
if not config_path.exists(): if not config_path.exists():
msg.fail("Can't find project config", config_path, exits=1) msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
invalid_err = f"Invalid project config in {CONFIG_FILE}" invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
try: try:
config = srsly.read_yaml(config_path) config = srsly.read_yaml(config_path)
except ValueError as e: except ValueError as e:
@ -500,7 +522,7 @@ def update_dvc_config(
dict, so if any of the config values change, the DVC config is regenerated. dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory. path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config. config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC). verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC). silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match. force (bool): Force update, even if hashes match.
@ -514,10 +536,10 @@ def update_dvc_config(
with dvc_config_path.open("r", encoding="utf8") as f: with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "") ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force: if ref_hash == config_hash and not force:
return False # Nothing has changed in project config, don't need to update return False # Nothing has changed in project.yml, don't need to update
dvc_config_path.unlink() dvc_config_path.unlink()
variables = config.get("variables", {}) variables = config.get("variables", {})
commands = [] dvc_commands = []
# We only want to include commands that are part of the main list of "run" # We only want to include commands that are part of the main list of "run"
# commands in project.yml and should be run in sequence # commands in project.yml and should be run in sequence
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
@ -535,15 +557,12 @@ def update_dvc_config(
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"] dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
if verbose:
dvc_cmd.append("--verbose")
if silent:
dvc_cmd.append("--quiet")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
commands.append(" ".join(full_cmd)) dvc_commands.append(" ".join(full_cmd))
with working_dir(path): with working_dir(path):
run_commands(commands, variables, silent=True) dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
with dvc_config_path.open("r+", encoding="utf8") as f: with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read() content = f.read()
f.seek(0, 0) f.seek(0, 0)
@ -571,7 +590,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
DVC project. DVC project.
project_dir (Path): The path to the project directory. project_dir (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config. config (Dict[str, Any]): The loaded project.yml.
""" """
if not project_dir.exists(): if not project_dir.exists():
msg.fail(f"Can't find project directory: {project_dir}") msg.fail(f"Can't find project directory: {project_dir}")
@ -586,38 +605,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
with msg.loading("Updating DVC config..."): with msg.loading("Updating DVC config..."):
updated = update_dvc_config(project_dir, config, silent=True) updated = update_dvc_config(project_dir, config, silent=True)
if updated: if updated:
msg.good(f"Updated DVC config from changed {CONFIG_FILE}") msg.good(f"Updated DVC config from changed {PROJECT_FILE}")
def run_commands(
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The string commands.
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]]
if not silent:
print(f"Running command: {' '.join(command)}")
run_command(command)
def convert_asset_url(url: str) -> str: def convert_asset_url(url: str) -> str:
@ -627,7 +615,7 @@ def convert_asset_url(url: str) -> str:
RETURNS (str): The converted URL. RETURNS (str): The converted URL.
""" """
# If the asset URL is a regular GitHub URL it's likely a mistake # If the asset URL is a regular GitHub URL it's likely a mistake
if re.match("(http(s?)):\/\/github.com", url): if re.match(r"(http(s?)):\/\/github.com", url):
converted = url.replace("github.com", "raw.githubusercontent.com") converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted) converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn( msg.warn(
@ -679,7 +667,7 @@ def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
""" """
if subcommand not in commands: if subcommand not in commands:
msg.fail( msg.fail(
f"Can't find command '{subcommand}' in {CONFIG_FILE}. " f"Can't find command '{subcommand}' in {PROJECT_FILE}. "
f"Available commands: {', '.join(commands)}", f"Available commands: {', '.join(commands)}",
exits=1, exits=1,
) )
@ -706,3 +694,112 @@ def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
for data in response.iter_content(chunk_size=chunk_size): for data in response.iter_content(chunk_size=chunk_size):
size = f.write(data) size = f.write(data)
bar.update(size) bar.update(size)
def run_commands(
    commands: Sequence[str] = (),
    variables: Optional[Dict[str, str]] = None,
    silent: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (Sequence[str]): The string commands.
    variables (Optional[Dict[str, str]]): Dictionary of variable names, mapped
        to their values. Will be used to substitute format string variables in
        the commands. If None, no variables are substituted.
    silent (bool): Don't print the commands.
    """
    # Use None as the default instead of a dict literal so that no mutable
    # object is shared between calls.
    variables = {} if variables is None else variables
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = split_command(command.format(**variables))
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if command and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif command and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {' '.join(command)}")
        run_command(command)
def run_dvc_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, str] = {},
    flags: Dict[str, bool] = {},
) -> None:
    """Execute several DVC commands in subprocesses, one after the other.

    commands (List[str]): The command strings, without the leading "dvc".
    variables (Dict[str, str]): Variable names mapped to their values. Used
        to fill in format string placeholders in the commands.
    flags (Dict[str, bool]): Conditional flags, passed through unchanged to
        run_dvc_command for every command.
    """
    for template in commands:
        # Fill in placeholders first, e.g. "./{NAME}.json", then tokenize
        dvc_args = split_command(template.format(**variables))
        run_dvc_command(dvc_args, flags=flags)
def run_dvc_command(
    command: Union[str, List[str]], flags: Dict[str, bool] = {}, silent: bool = False
) -> None:
    """Run a DVC command in a subprocess. This wrapper gives us a bit more
    control over how the output and errors are presented. Raises a DVC error if
    the "dvc" command returns a non-zero exit code and uses the error message
    logged by DVC.

    command (Union[str, List[str]]): The command, without the leading "dvc".
    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
        easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
    silent (bool): Don't print any output.
    RAISES (DVCError): If the process exits with a non-zero return code. The
        error message is whatever DVC wrote to stderr.
    """
    if isinstance(command, str):
        command = split_command(command)
    dvc_command = ["dvc", *command]
    # Add the flags if they are set to True
    dvc_command.extend(flag for flag, is_active in flags.items() if is_active)
    proc = subprocess.Popen(dvc_command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # Drain stdout and stderr together. Reading stdout to EOF on its own while
    # stderr is also a pipe can deadlock once the child fills the stderr
    # buffer. communicate() also waits for the process to exit, which is what
    # sets proc.returncode (it would otherwise still be None).
    out, err = proc.communicate()
    if not silent:
        # DVC separates its messages with blank lines, so split on those
        for line in out.decode("utf8").split("\n\n"):
            line = line.strip()
            if is_relevant_dvc_output(line):
                print(f"{line}\n")
    if proc.returncode != 0:
        if isinstance(err, bytes):
            err = err.decode("utf8")
        raise DVCError(err)
def is_relevant_dvc_output(line: str) -> bool:
    """Decide whether a piece of DVC output is worth showing to the user.

    line (str): A line written to stdout.
    RETURNS (bool): Whether to use/print the line. False for empty lines and
        for lines starting with "What's next?" or "Having any troubles?".
    """
    # str.startswith accepts a tuple of prefixes and matches any of them
    is_boilerplate = line.startswith(("What's next?", "Having any troubles?"))
    return bool(line) and not is_boilerplate
class DVCError(RuntimeError):
    """Error type used for failures reported by the DVC command-line tool."""

View File

@ -477,15 +477,14 @@ class Errors(object):
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?") E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
"array and {doc_length} for the Doc itself.") "array and {doc_length} for the Doc itself.")
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.") E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
E973 = ("Unexpected type for NER data") E973 = ("Unexpected type for NER data")
E974 = ("Unknown {obj} attribute: {key}") E974 = ("Unknown {obj} attribute: {key}")
E975 = ("The method 'Example.from_dict' expects a Doc as first argument, " E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
"but got {type}")
E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
"but received None.") "but received None.")
E977 = ("Can not compare a MorphAnalysis with a string object. " E977 = ("Can not compare a MorphAnalysis with a string object. "
"This is likely a bug in spaCy, so feel free to open an issue.") "This is likely a bug in spaCy, so feel free to open an issue.")

View File

@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
cdef class Example: cdef class Example:
def __init__(self, Doc predicted, Doc reference, *, alignment=None): def __init__(self, Doc predicted, Doc reference, *, alignment=None):
""" Doc can either be text, or an actual Doc """
if predicted is None: if predicted is None:
raise TypeError(Errors.E972.format(arg="predicted")) raise TypeError(Errors.E972.format(arg="predicted"))
if reference is None: if reference is None:
@ -59,17 +58,15 @@ cdef class Example:
@classmethod @classmethod
def from_dict(cls, Doc predicted, dict example_dict): def from_dict(cls, Doc predicted, dict example_dict):
if predicted is None:
raise ValueError(Errors.E976.format(n="first", type="Doc"))
if example_dict is None: if example_dict is None:
raise ValueError(Errors.E976) raise ValueError(Errors.E976.format(n="second", type="dict"))
if not isinstance(predicted, Doc):
raise TypeError(Errors.E975.format(type=type(predicted)))
example_dict = _fix_legacy_dict_data(example_dict) example_dict = _fix_legacy_dict_data(example_dict)
tok_dict, doc_dict = _parse_example_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if "ORTH" not in tok_dict: if "ORTH" not in tok_dict:
tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["ORTH"] = [tok.text for tok in predicted]
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
if not _has_field(tok_dict, "SPACY"):
spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
return Example( return Example(
predicted, predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict) annotations2doc(predicted.vocab, tok_dict, doc_dict)
@ -257,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
values.append([vocab.morphology.add(v) for v in value]) values.append([vocab.morphology.add(v) for v in value])
else: else:
attrs.append(key) attrs.append(key)
values.append([vocab.strings.add(v) for v in value]) try:
values.append([vocab.strings.add(v) for v in value])
except TypeError:
types= set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types))
array = numpy.asarray(values, dtype="uint64") array = numpy.asarray(values, dtype="uint64")
return attrs, array.T return attrs, array.T

View File

@ -272,7 +272,7 @@ cdef class Morphology:
@staticmethod @staticmethod
def feats_to_dict(feats): def feats_to_dict(feats):
if not feats: if not feats or feats == Morphology.EMPTY_MORPH:
return {} return {}
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}

View File

@ -3,7 +3,7 @@ cimport numpy as np
import numpy import numpy
import srsly import srsly
from thinc.api import to_categorical from thinc.api import SequenceCategoricalCrossentropy
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..vocab cimport Vocab from ..vocab cimport Vocab
@ -85,13 +85,10 @@ class Morphologizer(Tagger):
doc.is_morphed = True doc.is_morphed = True
def get_loss(self, examples, scores): def get_loss(self, examples, scores):
scores = self.model.ops.flatten(scores) loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
tag_index = {tag: i for i, tag in enumerate(self.labels)} truths = []
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
for eg in examples: for eg in examples:
eg_truths = []
pos_tags = eg.get_aligned("POS", as_string=True) pos_tags = eg.get_aligned("POS", as_string=True)
morphs = eg.get_aligned("MORPH", as_string=True) morphs = eg.get_aligned("MORPH", as_string=True)
for i in range(len(morphs)): for i in range(len(morphs)):
@ -104,20 +101,11 @@ class Morphologizer(Tagger):
morph = self.vocab.strings[self.vocab.morphology.add(feats)] morph = self.vocab.strings[self.vocab.morphology.add(feats)]
if morph == "": if morph == "":
morph = Morphology.EMPTY_MORPH morph = Morphology.EMPTY_MORPH
if morph is None: eg_truths.append(morph)
correct[idx] = guesses[idx] truths.append(eg_truths)
elif morph in tag_index: d_scores, loss = loss_func(scores, truths)
correct[idx] = tag_index[morph] if self.model.ops.xp.isnan(loss):
else: raise ValueError("nan value when computing loss")
correct[idx] = 0
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [eg.predicted for eg in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores return float(loss), d_scores
def to_bytes(self, exclude=tuple()): def to_bytes(self, exclude=tuple()):

View File

@ -521,29 +521,12 @@ class SentenceRecognizer(Tagger):
doc.c[j].sent_start = -1 doc.c[j].sent_start = -1
def get_loss(self, examples, scores): def get_loss(self, examples, scores):
scores = self.model.ops.flatten(scores) labels = self.labels
tag_index = range(len(self.labels)) loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
cdef int idx = 0 truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples]
correct = numpy.zeros((scores.shape[0],), dtype="i") d_scores, loss = loss_func(scores, truths)
guesses = scores.argmax(axis=1) if self.model.ops.xp.isnan(loss):
known_labels = numpy.ones((scores.shape[0], 1), dtype="f") raise ValueError("nan value when computing loss")
for eg in examples:
sent_starts = eg.get_aligned("sent_start")
for sent_start in sent_starts:
if sent_start is None:
correct[idx] = guesses[idx]
elif sent_start in tag_index:
correct[idx] = sent_start
else:
correct[idx] = 0
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [eg.predicted for eg in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores return float(loss), d_scores
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,

View File

@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
class ProjectConfigAsset(BaseModel): class ProjectConfigAsset(BaseModel):
# fmt: off # fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset") dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: StrictStr = Field(..., title="URL of asset") url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
# fmt: on # fmt: on

View File

@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree): def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
assert contains_cycle(tree) is None assert contains_cycle(tree) is None
assert contains_cycle(cyclic_tree) == set([3, 4, 5]) assert contains_cycle(cyclic_tree) == {3, 4, 5}
assert contains_cycle(partial_tree) is None assert contains_cycle(partial_tree) is None
assert contains_cycle(multirooted_tree) is None assert contains_cycle(multirooted_tree) is None

View File

@ -5,6 +5,7 @@ from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example from spacy.gold.example import Example
from spacy.gold.converters import json2docs from spacy.gold.converters import json2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, DocBin from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding from thinc.api import compounding
@ -272,72 +273,72 @@ def test_split_sentences(en_vocab):
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer): def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."] words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
spaces = [True, True, True, False, False] spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces) doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to " prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."] gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "U-LOC", "O"] assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
entities = [ entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"), (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
] ]
gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"] assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
entities = [ entities = [
(len("Mr. and "), len("Mr. and Mrs."), "PERSON"), # "Mrs." is a Person (len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"), (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
] ]
gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", None, "O", "U-LOC", "O"] assert ner_tags == ["O", None, "O", "U-LOC", "O"]
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer): def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
spaces = [True, True, True, True, True, True, True, False, False] spaces = [True, True, True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces) doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to " prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."] gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
entities = [ entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"), (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
] ]
gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."] gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
def test_gold_biluo_misaligned(en_vocab, en_tokenizer): def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."] words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
spaces = [True, True, True, True, True, False, False] spaces = [True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces) doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to " prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."] gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
entities = [ entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"), (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
] ]
gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."] gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"] assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
@ -407,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
assert spans[1].label_ == "GPE" assert spans[1].label_ == "GPE"
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
    """Reference entity spans should map back onto the predicted tokens."""

    def _bounds(spans):
        # Token-level (start, end) offsets of each span
        return [(span.start, span.end) for span in spans]

    person = "Mr and Mrs Smith"
    location = "San Francisco Valley"
    prefix = "Mr and Mrs Smith flew to "
    # The predicted doc keeps each entity as a single token ...
    doc = Doc(
        en_vocab,
        words=[person, "flew", "to", location, "."],
        spaces=[True, True, True, False, False],
    )
    entities = [
        (0, len(person), "PERSON"),
        (len(prefix), len(prefix + location), "LOC"),
    ]
    # ... while the reference tokenization splits them into several tokens
    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    ents_ref = example.reference.ents
    assert _bounds(ents_ref) == [(0, 4), (6, 9)]
    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
    assert _bounds(ents_y2x) == [(0, 1), (3, 4)]
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
    """Predicted entity spans should map onto the reference tokens."""

    def _bounds(spans):
        # Token-level (start, end) offsets of each span
        return [(span.start, span.end) for span in spans]

    person = "Mr and Mrs Smith"
    location = "San Francisco Valley"
    text = "Mr and Mrs Smith flew to San Francisco Valley"
    # Build a pipeline whose entity ruler sets the entities on the predicted doc
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [
            {"label": "PERSON", "pattern": person},
            {"label": "LOC", "pattern": location},
        ]
    )
    nlp.add_pipe(ruler)
    doc = nlp(text)
    assert _bounds(doc.ents) == [(0, 4), (6, 9)]
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len(person), "PERSON"),
        (len(prefix), len(prefix + location), "LOC"),
    ]
    tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    assert _bounds(example.reference.ents) == [(0, 2), (4, 6)]
    # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
    ents_pred = example.predicted.ents
    assert _bounds(ents_pred) == [(0, 4), (6, 9)]
    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
    assert _bounds(ents_x2y) == [(0, 2), (4, 6)]
def test_gold_ner_missing_tags(en_tokenizer): def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.") doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
@ -414,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
def test_projectivize(en_tokenizer):
    """The projectivize flag should control whether the aligned heads change."""
    gold_heads = [3, 2, 3, 0, 2]
    doc = en_tokenizer("He pretty quickly walks away")
    example = Example.from_dict(doc, {"heads": gold_heads})
    # Only the heads are checked here; the labels are discarded
    proj_heads, _ = example.get_aligned_parse(projectivize=True)
    nonproj_heads, _ = example.get_aligned_parse(projectivize=False)
    # With projectivize=True the head of the last token differs from the input
    assert proj_heads == [3, 2, 3, 0, 3]
    assert nonproj_heads == [3, 2, 3, 0, 2]
def test_iob_to_biluo(): def test_iob_to_biluo():
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]