mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
fb8a5967c1
|
@ -4,5 +4,4 @@ __version__ = "3.0.0a2"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
|
||||||
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Sequence, Union
|
from typing import Optional, Sequence
|
||||||
import requests
|
import requests
|
||||||
import sys
|
import sys
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import is_package, get_base_version, run_command
|
from ..util import is_package, get_base_version, run_command
|
||||||
|
|
||||||
|
# These are the old shortcuts we previously supported in spacy download. As of
|
||||||
|
# v3, shortcuts are deprecated so we're not expecting to add anything to this
|
||||||
|
# list. It only exists to show users warnings.
|
||||||
|
OLD_SHORTCUTS = {
|
||||||
|
"en": "en_core_web_sm",
|
||||||
|
"de": "de_core_news_sm",
|
||||||
|
"es": "es_core_news_sm",
|
||||||
|
"pt": "pt_core_news_sm",
|
||||||
|
"fr": "fr_core_news_sm",
|
||||||
|
"it": "it_core_news_sm",
|
||||||
|
"nl": "nl_core_news_sm",
|
||||||
|
"el": "el_core_news_sm",
|
||||||
|
"nb": "nb_core_news_sm",
|
||||||
|
"lt": "lt_core_news_sm",
|
||||||
|
"xx": "xx_ent_wiki_sm",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
"download",
|
"download",
|
||||||
|
@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
version = components[-1]
|
version = components[-1]
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
else:
|
else:
|
||||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
model_name = model
|
||||||
model_name = shortcuts.get(model, model)
|
if model in OLD_SHORTCUTS:
|
||||||
|
msg.warn(
|
||||||
|
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
|
||||||
|
f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
|
||||||
|
)
|
||||||
|
model_name = OLD_SHORTCUTS[model]
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
|
@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_json(url: str, desc: str) -> Union[dict, list]:
|
def get_compatibility() -> dict:
|
||||||
r = requests.get(url)
|
version = get_base_version(about.__version__)
|
||||||
|
r = requests.get(about.__compatibility__)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"Server error ({r.status_code})",
|
f"Server error ({r.status_code})",
|
||||||
f"Couldn't fetch {desc}. Please find a model for your spaCy "
|
f"Couldn't fetch compatibility table. Please find a model for your spaCy "
|
||||||
f"installation (v{about.__version__}), and download it manually. "
|
f"installation (v{about.__version__}), and download it manually. "
|
||||||
f"For more details, see the documentation: "
|
f"For more details, see the documentation: "
|
||||||
f"https://spacy.io/usage/models",
|
f"https://spacy.io/usage/models",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
return r.json()
|
comp_table = r.json()
|
||||||
|
|
||||||
|
|
||||||
def get_compatibility() -> dict:
|
|
||||||
version = get_base_version(about.__version__)
|
|
||||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
|
||||||
comp = comp_table["spacy"]
|
comp = comp_table["spacy"]
|
||||||
if version not in comp:
|
if version not in comp:
|
||||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import List, Dict, Any, Optional, Sequence
|
from typing import List, Dict, Any, Optional, Sequence, Union
|
||||||
import typer
|
import typer
|
||||||
import srsly
|
import srsly
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -18,7 +18,7 @@ from ..util import ensure_path, run_command, make_tempdir, working_dir
|
||||||
from ..util import get_hash, get_checksum, split_command
|
from ..util import get_hash, get_checksum, split_command
|
||||||
|
|
||||||
|
|
||||||
CONFIG_FILE = "project.yml"
|
PROJECT_FILE = "project.yml"
|
||||||
DVC_CONFIG = "dvc.yaml"
|
DVC_CONFIG = "dvc.yaml"
|
||||||
DVC_DIR = ".dvc"
|
DVC_DIR = ".dvc"
|
||||||
DIRS = [
|
DIRS = [
|
||||||
|
@ -38,12 +38,12 @@ CACHES = [
|
||||||
os.environ.get("TORCH_HOME"),
|
os.environ.get("TORCH_HOME"),
|
||||||
Path.home() / ".keras",
|
Path.home() / ".keras",
|
||||||
]
|
]
|
||||||
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
|
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. Do not edit
|
||||||
# it directly and edit the project.yml instead and re-run the project."""
|
# it directly and edit the {PROJECT_FILE} instead and re-run the project."""
|
||||||
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
|
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
|
||||||
templates. You'd typically start by cloning a project template to a local
|
templates. You'd typically start by cloning a project template to a local
|
||||||
directory and fetching its assets like datasets etc. See the project's
|
directory and fetching its assets like datasets etc. See the project's
|
||||||
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
|
{PROJECT_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
|
||||||
Version Control) to manage input and output files and to ensure steps are only
|
Version Control) to manage input and output files and to ensure steps are only
|
||||||
re-run if their inputs change.
|
re-run if their inputs change.
|
||||||
"""
|
"""
|
||||||
|
@ -91,7 +91,7 @@ def project_init_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
|
force: bool = Opt(False, "--force", "-F", "-f", help="Force initiziation"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Initialize a project directory with DVC and optionally Git. This should
|
"""Initialize a project directory with DVC and optionally Git. This should
|
||||||
|
@ -100,7 +100,7 @@ def project_init_cli(
|
||||||
be a Git repo, it should be initialized with Git first, before initializing
|
be a Git repo, it should be initialized with Git first, before initializing
|
||||||
DVC. This allows DVC to integrate with Git.
|
DVC. This allows DVC to integrate with Git.
|
||||||
"""
|
"""
|
||||||
project_init(path, git=git, force=force, silent=True)
|
project_init(path, git=git, force=force)
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("assets")
|
@project_cli.command("assets")
|
||||||
|
@ -110,11 +110,11 @@ def project_assets_cli(
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Use DVC (Data Version Control) to fetch project assets. Assets are
|
"""Use DVC (Data Version Control) to fetch project assets. Assets are
|
||||||
defined in the "assets" section of the project config. If possible, DVC
|
defined in the "assets" section of the project.yml. If possible, DVC
|
||||||
will try to track the files so you can pull changes from upstream. It will
|
will try to track the files so you can pull changes from upstream. It will
|
||||||
also try and store the checksum so the assets are versioned. If the file
|
also try and store the checksum so the assets are versioned. If the file
|
||||||
can't be tracked or checked, it will be downloaded without DVC. If a checksum
|
can't be tracked or checked, it will be downloaded without DVC. If a checksum
|
||||||
is provided in the project config, the file is only downloaded if no local
|
is provided in the project.yml, the file is only downloaded if no local
|
||||||
file with the same checksum exists.
|
file with the same checksum exists.
|
||||||
"""
|
"""
|
||||||
project_assets(project_dir)
|
project_assets(project_dir)
|
||||||
|
@ -132,7 +132,7 @@ def project_run_all_cli(
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Run all commands defined in the project. This command will use DVC and
|
"""Run all commands defined in the project. This command will use DVC and
|
||||||
the defined outputs and dependencies in the project config to determine
|
the defined outputs and dependencies in the project.yml to determine
|
||||||
which steps need to be re-run and where to start. This means you're only
|
which steps need to be re-run and where to start. This means you're only
|
||||||
re-generating data if the inputs have changed.
|
re-generating data if the inputs have changed.
|
||||||
|
|
||||||
|
@ -151,12 +151,12 @@ def project_run_all_cli(
|
||||||
def project_run_cli(
|
def project_run_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context,
|
ctx: typer.Context,
|
||||||
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Run a named script defined in the project config. If the command is
|
"""Run a named script defined in the project.yml. If the command is
|
||||||
part of the default pipeline defined in the "run" section, DVC is used to
|
part of the default pipeline defined in the "run" section, DVC is used to
|
||||||
determine whether the step should re-run if its inputs have changed, or
|
determine whether the step should re-run if its inputs have changed, or
|
||||||
whether everything is up to date. If the script is not part of the default
|
whether everything is up to date. If the script is not part of the default
|
||||||
|
@ -175,13 +175,13 @@ def project_run_cli(
|
||||||
@project_cli.command("exec", hidden=True)
|
@project_cli.command("exec", hidden=True)
|
||||||
def project_exec_cli(
|
def project_exec_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
subcommand: str = Arg(..., help="Name of command defined in project config"),
|
subcommand: str = Arg(..., help=f"Name of command defined in the {PROJECT_FILE}"),
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Execute a command defined in the project config. This CLI command is
|
"""Execute a command defined in the project.yml. This CLI command is
|
||||||
only called internally in auto-generated DVC pipelines, as a shortcut for
|
only called internally in auto-generated DVC pipelines, as a shortcut for
|
||||||
multi-step commands in the project config. You typically shouldn't have to
|
multi-step commands in the project.yml. You typically shouldn't have to
|
||||||
call it yourself. To run a command, call "run" or "run-all".
|
call it yourself. To run a command, call "run" or "run-all".
|
||||||
"""
|
"""
|
||||||
project_exec(project_dir, subcommand)
|
project_exec(project_dir, subcommand)
|
||||||
|
@ -196,15 +196,15 @@ def project_update_dvc_cli(
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Update the auto-generated DVC config file. Uses the steps defined in the
|
"""Update the auto-generated DVC config file. Uses the steps defined in the
|
||||||
"run" section of the project config. This typically happens automatically
|
"run" section of the project.yml. This typically happens automatically
|
||||||
when running a command, but can also be triggered manually if needed.
|
when running a command, but can also be triggered manually if needed.
|
||||||
"""
|
"""
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
||||||
if updated:
|
if updated:
|
||||||
msg.good(f"Updated DVC config from {CONFIG_FILE}")
|
msg.good(f"Updated DVC config from {PROJECT_FILE}")
|
||||||
else:
|
else:
|
||||||
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
msg.info(f"No changes found in {PROJECT_FILE}, no update needed")
|
||||||
|
|
||||||
|
|
||||||
app.add_typer(project_cli, name="project")
|
app.add_typer(project_cli, name="project")
|
||||||
|
@ -241,7 +241,7 @@ def project_clone(
|
||||||
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
||||||
try:
|
try:
|
||||||
run_command(cmd)
|
run_command(cmd)
|
||||||
except SystemExit:
|
except DVCError:
|
||||||
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
|
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
|
||||||
msg.fail(err)
|
msg.fail(err)
|
||||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
||||||
|
@ -249,7 +249,7 @@ def project_clone(
|
||||||
try:
|
try:
|
||||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
||||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
||||||
except SystemExit:
|
except DVCError:
|
||||||
err = f"Could not clone '{name}' in the repo '{repo}'."
|
err = f"Could not clone '{name}' in the repo '{repo}'."
|
||||||
msg.fail(err)
|
msg.fail(err)
|
||||||
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
||||||
|
@ -282,27 +282,29 @@ def project_init(
|
||||||
with working_dir(project_dir) as cwd:
|
with working_dir(project_dir) as cwd:
|
||||||
if git:
|
if git:
|
||||||
run_command(["git", "init"])
|
run_command(["git", "init"])
|
||||||
init_cmd = ["dvc", "init"]
|
flags = {"--force": force, "--quiet": silent, "--no-scm": not git}
|
||||||
if silent:
|
try:
|
||||||
init_cmd.append("--quiet")
|
run_dvc_command(["init"], flags=flags)
|
||||||
if not git:
|
except DVCError:
|
||||||
init_cmd.append("--no-scm")
|
msg.fail(
|
||||||
if force:
|
"Failed to initialize project. This likely means that the "
|
||||||
init_cmd.append("--force")
|
"project is already initialized and has a .dvc directory. "
|
||||||
run_command(init_cmd)
|
"To force-initialize, use the --force flag.",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
# We don't want to have analytics on by default – our users should
|
# We don't want to have analytics on by default – our users should
|
||||||
# opt-in explicitly. If they want it, they can always enable it.
|
# opt-in explicitly. If they want it, they can always enable it.
|
||||||
if not analytics:
|
if not analytics:
|
||||||
run_command(["dvc", "config", "core.analytics", "false"])
|
run_dvc_command(["config", "core.analytics", "false"])
|
||||||
# Remove unused and confusing plot templates from .dvc directory
|
# Remove unused and confusing plot templates from .dvc directory.
|
||||||
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
|
# Otherwise super confusing once you commit your changes via Git and it
|
||||||
# once you commit your changes via Git and it creates a bunch of files
|
# creates a bunch of files that have no purpose.
|
||||||
# that have no purpose
|
|
||||||
plots_dir = cwd / DVC_DIR / "plots"
|
plots_dir = cwd / DVC_DIR / "plots"
|
||||||
if plots_dir.exists():
|
if plots_dir.exists():
|
||||||
shutil.rmtree(str(plots_dir))
|
shutil.rmtree(str(plots_dir))
|
||||||
config = load_project_config(cwd)
|
config = load_project_config(cwd)
|
||||||
setup_check_dvc(cwd, config)
|
setup_check_dvc(cwd, config)
|
||||||
|
msg.good("Initialized project")
|
||||||
|
|
||||||
|
|
||||||
def project_assets(project_dir: Path) -> None:
|
def project_assets(project_dir: Path) -> None:
|
||||||
|
@ -315,19 +317,33 @@ def project_assets(project_dir: Path) -> None:
|
||||||
setup_check_dvc(project_path, config)
|
setup_check_dvc(project_path, config)
|
||||||
assets = config.get("assets", {})
|
assets = config.get("assets", {})
|
||||||
if not assets:
|
if not assets:
|
||||||
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||||
variables = config.get("variables", {})
|
variables = config.get("variables", {})
|
||||||
fetched_assets = []
|
fetched_assets = []
|
||||||
for asset in assets:
|
for asset in assets:
|
||||||
url = asset["url"].format(**variables)
|
|
||||||
dest = asset["dest"].format(**variables)
|
dest = asset["dest"].format(**variables)
|
||||||
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
|
url = asset.get("url")
|
||||||
|
checksum = asset.get("checksum")
|
||||||
|
if not url:
|
||||||
|
# project.yml defines asset without URL that the user has to place
|
||||||
|
if not Path(dest).exists():
|
||||||
|
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
|
||||||
|
msg.warn(err)
|
||||||
|
else:
|
||||||
|
if checksum == get_checksum(dest):
|
||||||
|
msg.good(f"Asset exists with matching checksum: {dest}")
|
||||||
|
fetched_assets.append((project_path / dest).resolve())
|
||||||
|
else:
|
||||||
|
msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
||||||
|
continue
|
||||||
|
url = url.format(**variables)
|
||||||
|
fetched_path = fetch_asset(project_path, url, dest, checksum)
|
||||||
if fetched_path:
|
if fetched_path:
|
||||||
fetched_assets.append(str(fetched_path))
|
fetched_assets.append(str(fetched_path))
|
||||||
if fetched_assets:
|
if fetched_assets:
|
||||||
with working_dir(project_path):
|
with working_dir(project_path):
|
||||||
run_command(["dvc", "add", *fetched_assets, "--external"])
|
run_dvc_command(["add", *fetched_assets, "--external"])
|
||||||
|
|
||||||
|
|
||||||
def fetch_asset(
|
def fetch_asset(
|
||||||
|
@ -359,19 +375,17 @@ def fetch_asset(
|
||||||
# Try with tracking the source first, then just downloading with
|
# Try with tracking the source first, then just downloading with
|
||||||
# DVC, then a regular non-DVC download.
|
# DVC, then a regular non-DVC download.
|
||||||
try:
|
try:
|
||||||
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
|
run_dvc_command(["import-url", url, str(dest_path)])
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
except DVCError:
|
||||||
except subprocess.CalledProcessError:
|
run_dvc_command(["get-url", url, str(dest_path)])
|
||||||
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
except DVCError:
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
try:
|
try:
|
||||||
download_file(url, dest_path)
|
download_file(url, dest_path)
|
||||||
except requests.exceptions.HTTPError as e:
|
except requests.exceptions.HTTPError as e:
|
||||||
msg.fail(f"Download failed: {dest}", e)
|
msg.fail(f"Download failed: {dest}", e)
|
||||||
return None
|
return None
|
||||||
if checksum and checksum != get_checksum(dest_path):
|
if checksum and checksum != get_checksum(dest_path):
|
||||||
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
|
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||||
msg.good(f"Fetched asset {dest}")
|
msg.good(f"Fetched asset {dest}")
|
||||||
return dest_path
|
return dest_path
|
||||||
|
|
||||||
|
@ -384,13 +398,17 @@ def project_run_all(project_dir: Path, *dvc_args) -> None:
|
||||||
"""
|
"""
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
setup_check_dvc(project_dir, config)
|
setup_check_dvc(project_dir, config)
|
||||||
dvc_cmd = ["dvc", "repro", *dvc_args]
|
|
||||||
with working_dir(project_dir):
|
with working_dir(project_dir):
|
||||||
run_command(dvc_cmd)
|
try:
|
||||||
|
run_dvc_command(["repro", *dvc_args])
|
||||||
|
except DVCError:
|
||||||
|
# We could raise a custom error here, but the output produced by
|
||||||
|
# DVC is already pretty substantial.
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
"""Simulate a CLI help prompt using the info available in the project config.
|
"""Simulate a CLI help prompt using the info available in the project.yml.
|
||||||
|
|
||||||
project_dir (Path): The project directory.
|
project_dir (Path): The project directory.
|
||||||
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
||||||
|
@ -408,15 +426,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
if help_text:
|
if help_text:
|
||||||
msg.text(f"\n{help_text}\n")
|
msg.text(f"\n{help_text}\n")
|
||||||
else:
|
else:
|
||||||
print(f"\nAvailable commands in {CONFIG_FILE}")
|
print(f"\nAvailable commands in {PROJECT_FILE}")
|
||||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
|
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
|
||||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
||||||
msg.text("Run all commands defined in the 'run' block of the project config:")
|
msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:")
|
||||||
print(f"{COMMAND} project run-all {project_dir}")
|
print(f"{COMMAND} project run-all {project_dir}")
|
||||||
|
|
||||||
|
|
||||||
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
||||||
"""Run a named script defined in the project config. If the script is part
|
"""Run a named script defined in the project.yml. If the script is part
|
||||||
of the default pipeline (defined in the "run" section), DVC is used to
|
of the default pipeline (defined in the "run" section), DVC is used to
|
||||||
execute the command, so it can determine whether to rerun it. It then
|
execute the command, so it can determine whether to rerun it. It then
|
||||||
calls into "exec" to execute it.
|
calls into "exec" to execute it.
|
||||||
|
@ -433,9 +451,13 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
||||||
validate_subcommand(commands.keys(), subcommand)
|
validate_subcommand(commands.keys(), subcommand)
|
||||||
if subcommand in config.get("run", []):
|
if subcommand in config.get("run", []):
|
||||||
# This is one of the pipeline commands tracked in DVC
|
# This is one of the pipeline commands tracked in DVC
|
||||||
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
|
|
||||||
with working_dir(project_dir):
|
with working_dir(project_dir):
|
||||||
run_command(dvc_cmd)
|
try:
|
||||||
|
run_dvc_command(["repro", subcommand, *dvc_args])
|
||||||
|
except DVCError:
|
||||||
|
# We could raise a custom error here, but the output produced by
|
||||||
|
# DVC is already pretty substantial.
|
||||||
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
cmd = commands[subcommand]
|
cmd = commands[subcommand]
|
||||||
# Deps in non-DVC commands aren't tracked, but if they're defined,
|
# Deps in non-DVC commands aren't tracked, but if they're defined,
|
||||||
|
@ -448,8 +470,8 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
||||||
run_commands(cmd["script"], variables)
|
run_commands(cmd["script"], variables)
|
||||||
|
|
||||||
|
|
||||||
def project_exec(project_dir: Path, subcommand: str):
|
def project_exec(project_dir: Path, subcommand: str) -> None:
|
||||||
"""Execute a command defined in the project config.
|
"""Execute a command defined in the project.yml.
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
project_dir (Path): Path to project directory.
|
||||||
subcommand (str): Name of command to run.
|
subcommand (str): Name of command to run.
|
||||||
|
@ -468,15 +490,15 @@ def project_exec(project_dir: Path, subcommand: str):
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(path: Path) -> Dict[str, Any]:
|
def load_project_config(path: Path) -> Dict[str, Any]:
|
||||||
"""Load the project config file from a directory and validate it.
|
"""Load the project.yml file from a directory and validate it.
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
path (Path): The path to the project directory.
|
||||||
RETURNS (Dict[str, Any]): The loaded project config.
|
RETURNS (Dict[str, Any]): The loaded project.yml.
|
||||||
"""
|
"""
|
||||||
config_path = path / CONFIG_FILE
|
config_path = path / PROJECT_FILE
|
||||||
if not config_path.exists():
|
if not config_path.exists():
|
||||||
msg.fail("Can't find project config", config_path, exits=1)
|
msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
|
||||||
invalid_err = f"Invalid project config in {CONFIG_FILE}"
|
invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
|
||||||
try:
|
try:
|
||||||
config = srsly.read_yaml(config_path)
|
config = srsly.read_yaml(config_path)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
|
@ -500,7 +522,7 @@ def update_dvc_config(
|
||||||
dict, so if any of the config values change, the DVC config is regenerated.
|
dict, so if any of the config values change, the DVC config is regenerated.
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
path (Path): The path to the project directory.
|
||||||
config (Dict[str, Any]): The loaded project config.
|
config (Dict[str, Any]): The loaded project.yml.
|
||||||
verbose (bool): Whether to print additional info (via DVC).
|
verbose (bool): Whether to print additional info (via DVC).
|
||||||
silent (bool): Don't output anything (via DVC).
|
silent (bool): Don't output anything (via DVC).
|
||||||
force (bool): Force update, even if hashes match.
|
force (bool): Force update, even if hashes match.
|
||||||
|
@ -514,10 +536,10 @@ def update_dvc_config(
|
||||||
with dvc_config_path.open("r", encoding="utf8") as f:
|
with dvc_config_path.open("r", encoding="utf8") as f:
|
||||||
ref_hash = f.readline().strip().replace("# ", "")
|
ref_hash = f.readline().strip().replace("# ", "")
|
||||||
if ref_hash == config_hash and not force:
|
if ref_hash == config_hash and not force:
|
||||||
return False # Nothing has changed in project config, don't need to update
|
return False # Nothing has changed in project.yml, don't need to update
|
||||||
dvc_config_path.unlink()
|
dvc_config_path.unlink()
|
||||||
variables = config.get("variables", {})
|
variables = config.get("variables", {})
|
||||||
commands = []
|
dvc_commands = []
|
||||||
# We only want to include commands that are part of the main list of "run"
|
# We only want to include commands that are part of the main list of "run"
|
||||||
# commands in project.yml and should be run in sequence
|
# commands in project.yml and should be run in sequence
|
||||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
|
@ -535,15 +557,12 @@ def update_dvc_config(
|
||||||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||||
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
|
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
|
||||||
if verbose:
|
|
||||||
dvc_cmd.append("--verbose")
|
|
||||||
if silent:
|
|
||||||
dvc_cmd.append("--quiet")
|
|
||||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||||
commands.append(" ".join(full_cmd))
|
dvc_commands.append(" ".join(full_cmd))
|
||||||
with working_dir(path):
|
with working_dir(path):
|
||||||
run_commands(commands, variables, silent=True)
|
dvc_flags = {"--verbose": verbose, "--quiet": silent}
|
||||||
|
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
|
||||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
f.seek(0, 0)
|
f.seek(0, 0)
|
||||||
|
@ -571,7 +590,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
|
||||||
DVC project.
|
DVC project.
|
||||||
|
|
||||||
project_dir (Path): The path to the project directory.
|
project_dir (Path): The path to the project directory.
|
||||||
config (Dict[str, Any]): The loaded project config.
|
config (Dict[str, Any]): The loaded project.yml.
|
||||||
"""
|
"""
|
||||||
if not project_dir.exists():
|
if not project_dir.exists():
|
||||||
msg.fail(f"Can't find project directory: {project_dir}")
|
msg.fail(f"Can't find project directory: {project_dir}")
|
||||||
|
@ -586,38 +605,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
|
||||||
with msg.loading("Updating DVC config..."):
|
with msg.loading("Updating DVC config..."):
|
||||||
updated = update_dvc_config(project_dir, config, silent=True)
|
updated = update_dvc_config(project_dir, config, silent=True)
|
||||||
if updated:
|
if updated:
|
||||||
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|
msg.good(f"Updated DVC config from changed {PROJECT_FILE}")
|
||||||
|
|
||||||
|
|
||||||
def run_commands(
    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
    """Execute a list of command strings one after another via run_command.

    commands (List[str]): The command strings to execute, in order.
    variables (Dict[str, str]): Variable names mapped to their values. Used
        to fill in format-string placeholders found in the commands.
    silent (bool): If True, the commands are not printed before they run.
    """
    python_aliases = ("python", "python3")
    pip_aliases = ("pip", "pip3")
    for template in commands:
        # Fill in placeholders such as "./{NAME}.json", then tokenize.
        parts = split_command(template.format(**variables))
        program = parts[0] if len(parts) else None
        # Users often write "python"/"pip" (or "python3"/"pip3") in their
        # project config. Route those through the interpreter that is running
        # spaCy, so the same Python and the pip of the same environment are
        # used regardless of how the alias is set up on a given machine.
        if program in python_aliases:
            parts[0] = sys.executable
        elif program in pip_aliases:
            parts = [sys.executable, "-m", "pip", *parts[1:]]
        if not silent:
            print(f"Running command: {' '.join(parts)}")
        run_command(parts)
|
|
||||||
|
|
||||||
|
|
||||||
def convert_asset_url(url: str) -> str:
|
def convert_asset_url(url: str) -> str:
|
||||||
|
@ -627,7 +615,7 @@ def convert_asset_url(url: str) -> str:
|
||||||
RETURNS (str): The converted URL.
|
RETURNS (str): The converted URL.
|
||||||
"""
|
"""
|
||||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||||
if re.match("(http(s?)):\/\/github.com", url):
|
if re.match(r"(http(s?)):\/\/github.com", url):
|
||||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||||
msg.warn(
|
msg.warn(
|
||||||
|
@ -679,7 +667,7 @@ def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
|
||||||
"""
|
"""
|
||||||
if subcommand not in commands:
|
if subcommand not in commands:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
|
f"Can't find command '{subcommand}' in {PROJECT_FILE}. "
|
||||||
f"Available commands: {', '.join(commands)}",
|
f"Available commands: {', '.join(commands)}",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
@ -706,3 +694,112 @@ def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
||||||
for data in response.iter_content(chunk_size=chunk_size):
|
for data in response.iter_content(chunk_size=chunk_size):
|
||||||
size = f.write(data)
|
size = f.write(data)
|
||||||
bar.update(size)
|
bar.update(size)
|
||||||
|
|
||||||
|
|
||||||
|
def run_commands(
    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
    """Execute a list of command strings one after another via run_command.

    commands (List[str]): The command strings to execute, in order.
    variables (Dict[str, str]): Variable names mapped to their values. Used
        to fill in format-string placeholders found in the commands.
    silent (bool): If True, the commands are not printed before they run.
    """
    python_aliases = ("python", "python3")
    pip_aliases = ("pip", "pip3")
    for template in commands:
        # Fill in placeholders such as "./{NAME}.json", then tokenize.
        parts = split_command(template.format(**variables))
        program = parts[0] if len(parts) else None
        # Users often write "python"/"pip" (or "python3"/"pip3") in their
        # project config. Route those through the interpreter that is running
        # spaCy, so the same Python and the pip of the same environment are
        # used regardless of how the alias is set up on a given machine.
        if program in python_aliases:
            parts[0] = sys.executable
        elif program in pip_aliases:
            parts = [sys.executable, "-m", "pip", *parts[1:]]
        if not silent:
            print(f"Running command: {' '.join(parts)}")
        run_command(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def run_dvc_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, str] = {},
    flags: Dict[str, bool] = {},
) -> None:
    """Execute several DVC commands one after another via run_dvc_command.

    commands (List[str]): The command strings, written without the leading
        "dvc".
    variables (Dict[str, str]): Variable names mapped to their values. Used
        to fill in format-string placeholders found in the commands.
    flags (Dict[str, bool]): Conditional flags, passed through unchanged to
        run_dvc_command for every command. Lets callers hand over options
        like --quiet that depend on a variable or command-line setting
        without nesting conditionals.
    """
    for template in commands:
        # Fill in placeholders such as "./{NAME}.json" before tokenizing.
        filled = template.format(**variables)
        run_dvc_command(split_command(filled), flags=flags)
|
||||||
|
|
||||||
|
|
||||||
|
def run_dvc_command(
    command: Union[str, List[str]], flags: Dict[str, bool] = {}, silent: bool = False
) -> None:
    """Run a DVC command in a subprocess. This wrapper gives us a bit more
    control over how the output and errors are presented. Raises a DVCError if
    the "dvc" command returns a non-zero exit code, using whatever DVC wrote
    to stderr as the error message.

    command (Union[str, List[str]]): The command, without the leading "dvc".
        A string is tokenized with split_command first.
    flags (Dict[str, bool]): Conditional flags to be added to the command.
        Each key is appended if its value is truthy. Makes it easier to pass
        flags like --quiet that depend on a variable or command-line setting
        while avoiding lots of nested conditionals.
    silent (bool): Don't print any of the command's stdout.
    RAISES (DVCError): If the process exits with a non-zero return code.
    """
    if isinstance(command, str):
        command = split_command(command)
    # Only flags that are switched on end up on the command line.
    active_flags = [flag for flag, is_active in flags.items() if is_active]
    dvc_command = ["dvc", *command, *active_flags]
    proc = subprocess.Popen(dvc_command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # Collect stdout and stderr together with communicate(). Reading stdout to
    # EOF on its own first can deadlock once the child fills the stderr pipe,
    # because nobody is draining it. communicate() also waits for the process,
    # which is what populates proc.returncode.
    out, err = proc.communicate()
    if not silent:
        # The output is processed in chunks separated by blank lines.
        for chunk in out.decode("utf8").split("\n\n"):
            chunk = chunk.strip()
            if is_relevant_dvc_output(chunk):
                print(f"{chunk}\n")
    if proc.returncode != 0:
        if isinstance(err, bytes):
            err = err.decode("utf8")
        raise DVCError(err)
|
||||||
|
|
||||||
|
|
||||||
|
def is_relevant_dvc_output(line: str) -> bool:
    """Check whether the output by DVC is something we want to keep.

    Empty text and text starting with "What's next?" or
    "Having any troubles?" is rejected; everything else is kept.

    line (str): A line written to stdout.
    RETURNS (bool): Whether to use/print the line.
    """
    # str.startswith accepts a tuple of prefixes, so one call covers them all.
    ignored_prefixes = ("What's next?", "Having any troubles?")
    return bool(line) and not line.startswith(ignored_prefixes)
|
||||||
|
|
||||||
|
|
||||||
|
class DVCError(RuntimeError):
    """Error raised for failures reported by the DVC CLI.

    Adds no behavior of its own; it only gives callers a dedicated type to
    catch.
    """
|
||||||
|
|
|
@ -477,15 +477,14 @@ class Errors(object):
|
||||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
|
||||||
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
|
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
|
||||||
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
||||||
"array and {doc_length} for the Doc itself.")
|
"array and {doc_length} for the Doc itself.")
|
||||||
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
|
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
|
||||||
E973 = ("Unexpected type for NER data")
|
E973 = ("Unexpected type for NER data")
|
||||||
E974 = ("Unknown {obj} attribute: {key}")
|
E974 = ("Unknown {obj} attribute: {key}")
|
||||||
E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
|
E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
|
||||||
"but got {type}")
|
|
||||||
E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
|
|
||||||
"but received None.")
|
"but received None.")
|
||||||
E977 = ("Can not compare a MorphAnalysis with a string object. "
|
E977 = ("Can not compare a MorphAnalysis with a string object. "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||||
|
|
|
@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||||
|
|
||||||
cdef class Example:
|
cdef class Example:
|
||||||
def __init__(self, Doc predicted, Doc reference, *, alignment=None):
|
def __init__(self, Doc predicted, Doc reference, *, alignment=None):
|
||||||
""" Doc can either be text, or an actual Doc """
|
|
||||||
if predicted is None:
|
if predicted is None:
|
||||||
raise TypeError(Errors.E972.format(arg="predicted"))
|
raise TypeError(Errors.E972.format(arg="predicted"))
|
||||||
if reference is None:
|
if reference is None:
|
||||||
|
@ -59,17 +58,15 @@ cdef class Example:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, Doc predicted, dict example_dict):
|
def from_dict(cls, Doc predicted, dict example_dict):
|
||||||
|
if predicted is None:
|
||||||
|
raise ValueError(Errors.E976.format(n="first", type="Doc"))
|
||||||
if example_dict is None:
|
if example_dict is None:
|
||||||
raise ValueError(Errors.E976)
|
raise ValueError(Errors.E976.format(n="second", type="dict"))
|
||||||
if not isinstance(predicted, Doc):
|
|
||||||
raise TypeError(Errors.E975.format(type=type(predicted)))
|
|
||||||
example_dict = _fix_legacy_dict_data(example_dict)
|
example_dict = _fix_legacy_dict_data(example_dict)
|
||||||
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
||||||
if "ORTH" not in tok_dict:
|
if "ORTH" not in tok_dict:
|
||||||
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
||||||
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
||||||
if not _has_field(tok_dict, "SPACY"):
|
|
||||||
spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
|
|
||||||
return Example(
|
return Example(
|
||||||
predicted,
|
predicted,
|
||||||
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
||||||
|
@ -257,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
values.append([vocab.morphology.add(v) for v in value])
|
values.append([vocab.morphology.add(v) for v in value])
|
||||||
else:
|
else:
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.strings.add(v) for v in value])
|
try:
|
||||||
|
values.append([vocab.strings.add(v) for v in value])
|
||||||
|
except TypeError:
|
||||||
|
types= set([type(v) for v in value])
|
||||||
|
raise TypeError(Errors.E969.format(field=key, types=types))
|
||||||
|
|
||||||
array = numpy.asarray(values, dtype="uint64")
|
array = numpy.asarray(values, dtype="uint64")
|
||||||
return attrs, array.T
|
return attrs, array.T
|
||||||
|
|
|
@ -272,7 +272,7 @@ cdef class Morphology:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def feats_to_dict(feats):
|
def feats_to_dict(feats):
|
||||||
if not feats:
|
if not feats or feats == Morphology.EMPTY_MORPH:
|
||||||
return {}
|
return {}
|
||||||
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
||||||
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
||||||
|
|
|
@ -3,7 +3,7 @@ cimport numpy as np
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import to_categorical
|
from thinc.api import SequenceCategoricalCrossentropy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
|
@ -85,13 +85,10 @@ class Morphologizer(Tagger):
|
||||||
doc.is_morphed = True
|
doc.is_morphed = True
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
tag_index = {tag: i for i, tag in enumerate(self.labels)}
|
truths = []
|
||||||
cdef int idx = 0
|
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
|
||||||
guesses = scores.argmax(axis=1)
|
|
||||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
|
eg_truths = []
|
||||||
pos_tags = eg.get_aligned("POS", as_string=True)
|
pos_tags = eg.get_aligned("POS", as_string=True)
|
||||||
morphs = eg.get_aligned("MORPH", as_string=True)
|
morphs = eg.get_aligned("MORPH", as_string=True)
|
||||||
for i in range(len(morphs)):
|
for i in range(len(morphs)):
|
||||||
|
@ -104,20 +101,11 @@ class Morphologizer(Tagger):
|
||||||
morph = self.vocab.strings[self.vocab.morphology.add(feats)]
|
morph = self.vocab.strings[self.vocab.morphology.add(feats)]
|
||||||
if morph == "":
|
if morph == "":
|
||||||
morph = Morphology.EMPTY_MORPH
|
morph = Morphology.EMPTY_MORPH
|
||||||
if morph is None:
|
eg_truths.append(morph)
|
||||||
correct[idx] = guesses[idx]
|
truths.append(eg_truths)
|
||||||
elif morph in tag_index:
|
d_scores, loss = loss_func(scores, truths)
|
||||||
correct[idx] = tag_index[morph]
|
if self.model.ops.xp.isnan(loss):
|
||||||
else:
|
raise ValueError("nan value when computing loss")
|
||||||
correct[idx] = 0
|
|
||||||
known_labels[idx] = 0.
|
|
||||||
idx += 1
|
|
||||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
|
||||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
|
||||||
d_scores *= self.model.ops.asarray(known_labels)
|
|
||||||
loss = (d_scores**2).sum()
|
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple()):
|
def to_bytes(self, exclude=tuple()):
|
||||||
|
|
|
@ -521,29 +521,12 @@ class SentenceRecognizer(Tagger):
|
||||||
doc.c[j].sent_start = -1
|
doc.c[j].sent_start = -1
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
labels = self.labels
|
||||||
tag_index = range(len(self.labels))
|
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
||||||
cdef int idx = 0
|
truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples]
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
d_scores, loss = loss_func(scores, truths)
|
||||||
guesses = scores.argmax(axis=1)
|
if self.model.ops.xp.isnan(loss):
|
||||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
raise ValueError("nan value when computing loss")
|
||||||
for eg in examples:
|
|
||||||
sent_starts = eg.get_aligned("sent_start")
|
|
||||||
for sent_start in sent_starts:
|
|
||||||
if sent_start is None:
|
|
||||||
correct[idx] = guesses[idx]
|
|
||||||
elif sent_start in tag_index:
|
|
||||||
correct[idx] = sent_start
|
|
||||||
else:
|
|
||||||
correct[idx] = 0
|
|
||||||
known_labels[idx] = 0.
|
|
||||||
idx += 1
|
|
||||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
|
||||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
|
||||||
d_scores *= self.model.ops.asarray(known_labels)
|
|
||||||
loss = (d_scores**2).sum()
|
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
||||||
|
|
|
@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
|
||||||
class ProjectConfigAsset(BaseModel):
|
class ProjectConfigAsset(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
||||||
url: StrictStr = Field(..., title="URL of asset")
|
url: Optional[StrictStr] = Field(None, title="URL of asset")
|
||||||
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
|
@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
|
|
||||||
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
assert contains_cycle(tree) is None
|
assert contains_cycle(tree) is None
|
||||||
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
|
assert contains_cycle(cyclic_tree) == {3, 4, 5}
|
||||||
assert contains_cycle(partial_tree) is None
|
assert contains_cycle(partial_tree) is None
|
||||||
assert contains_cycle(multirooted_tree) is None
|
assert contains_cycle(multirooted_tree) is None
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,7 @@ from spacy.gold import Corpus, docs_to_json
|
||||||
from spacy.gold.example import Example
|
from spacy.gold.example import Example
|
||||||
from spacy.gold.converters import json2docs
|
from spacy.gold.converters import json2docs
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
from spacy.pipeline import EntityRuler
|
||||||
from spacy.tokens import Doc, DocBin
|
from spacy.tokens import Doc, DocBin
|
||||||
from spacy.util import get_words_and_spaces, minibatch
|
from spacy.util import get_words_and_spaces, minibatch
|
||||||
from thinc.api import compounding
|
from thinc.api import compounding
|
||||||
|
@ -272,72 +273,72 @@ def test_split_sentences(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
|
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
|
||||||
words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
|
words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
spaces = [True, True, True, False, False]
|
spaces = [True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
prefix = "Mr. and Mrs. Smith flew to "
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
|
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("Mr. and "), len("Mr. and Mrs."), "PERSON"), # "Mrs." is a Person
|
(len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person
|
||||||
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", None, "O", "U-LOC", "O"]
|
assert ner_tags == ["O", None, "O", "U-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
|
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
|
||||||
words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
spaces = [True, True, True, True, True, True, True, False, False]
|
spaces = [True, True, True, True, True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
prefix = "Mr. and Mrs. Smith flew to "
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."]
|
gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
|
gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
|
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
|
||||||
words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."]
|
words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
|
||||||
spaces = [True, True, True, True, True, False, False]
|
spaces = [True, True, True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
prefix = "Mr. and Mrs. Smith flew to "
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
|
gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
|
gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
|
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
|
@ -407,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
|
||||||
assert spans[1].label_ == "GPE"
|
assert spans[1].label_ == "GPE"
|
||||||
|
|
||||||
|
|
||||||
|
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
    """Check the token offsets returned by Example.get_aligned_spans_y2x for
    the reference entities of an example whose two docs are tokenized
    differently."""
    prefix = "Mr and Mrs Smith flew to "
    predicted = Doc(
        en_vocab,
        words=["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."],
        spaces=[True, True, True, False, False],
    )
    annotations = {
        "words": ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."],
        "entities": [
            (0, len("Mr and Mrs Smith"), "PERSON"),
            (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
        ],
    }
    example = Example.from_dict(predicted, annotations)

    # Entity offsets as stored on the reference doc
    reference_ents = example.reference.ents
    reference_offsets = [(span.start, span.end) for span in reference_ents]
    assert reference_offsets == [(0, 4), (6, 9)]

    # Entity offsets after alignment
    aligned_ents = example.get_aligned_spans_y2x(reference_ents)
    aligned_offsets = [(span.start, span.end) for span in aligned_ents]
    assert aligned_offsets == [(0, 1), (3, 4)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
    """Check the token offsets returned by Example.get_aligned_spans_x2y for
    entities set on the predicted doc by an EntityRuler."""

    def offsets(spans):
        # Reduce spans to comparable (start, end) token offsets
        return [(span.start, span.end) for span in spans]

    # Predicted doc: English pipeline with a rule-based entity recognizer
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [
            {"label": "PERSON", "pattern": "Mr and Mrs Smith"},
            {"label": "LOC", "pattern": "San Francisco Valley"},
        ]
    )
    nlp.add_pipe(ruler)
    doc = nlp("Mr and Mrs Smith flew to San Francisco Valley")
    assert offsets(doc.ents) == [(0, 4), (6, 9)]

    # Reference annotations with a different tokenization
    prefix = "Mr and Mrs Smith flew to "
    annotations = {
        "words": ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"],
        "entities": [
            (0, len("Mr and Mrs Smith"), "PERSON"),
            (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
        ],
    }
    example = Example.from_dict(doc, annotations)
    assert offsets(example.reference.ents) == [(0, 2), (4, 6)]

    # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
    predicted_ents = example.predicted.ents
    assert offsets(predicted_ents) == [(0, 4), (6, 9)]
    aligned_ents = example.get_aligned_spans_x2y(predicted_ents)
    assert offsets(aligned_ents) == [(0, 2), (4, 6)]
|
||||||
|
|
||||||
|
|
||||||
def test_gold_ner_missing_tags(en_tokenizer):
|
def test_gold_ner_missing_tags(en_tokenizer):
|
||||||
doc = en_tokenizer("I flew to Silicon Valley via London.")
|
doc = en_tokenizer("I flew to Silicon Valley via London.")
|
||||||
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||||
|
@ -414,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
|
||||||
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
|
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_projectivize(en_tokenizer):
    """Check the heads returned by Example.get_aligned_parse with and without
    the projectivize option."""
    original_heads = [3, 2, 3, 0, 2]
    example = Example.from_dict(
        en_tokenizer("He pretty quickly walks away"), {"heads": original_heads}
    )
    # With projectivize=True the head of the last token is expected to change
    projective_heads, _ = example.get_aligned_parse(projectivize=True)
    # With projectivize=False the heads are expected back as given
    unchanged_heads, _ = example.get_aligned_parse(projectivize=False)
    assert projective_heads == [3, 2, 3, 0, 3]
    assert unchanged_heads == [3, 2, 3, 0, 2]
|
||||||
|
|
||||||
|
|
||||||
def test_iob_to_biluo():
|
def test_iob_to_biluo():
|
||||||
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
||||||
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
|
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user