Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-13 10:46:29 +03:00

Commit 028f8210e8: Merge branch 'develop' into nightly.spacy.io
@@ -9,27 +9,28 @@ max_length = 5000
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-dropout = 0.2
+dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
-patience = 1600
+patience = 100000
 max_epochs = 0
-max_steps = 20000
-eval_frequency = 500
+max_steps = 0
+eval_frequency = 1000
 # Other settings
 seed = 0
-accumulate_gradient = 1
+accumulate_gradient = 2
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
 scores = ["speed", "ents_p", "ents_r", "ents_f"]
 score_weights = {"ents_f": 1.0}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-discard_oversize = false
+discard_oversize = true
 omit_extra_lookups = false
 batch_by_words = true

 [training.batch_size]
 @schedules = "compounding.v1"
-start = 100
+start = 1000
 stop = 1000
 compound = 1.001
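For reference, a compounding batch-size schedule like "compounding.v1" above starts at start, multiplies the value by compound at every step and caps it at stop. A minimal sketch of that behavior (illustrative only; the real schedule is registered in thinc):

# Illustrative sketch, not part of the diff: behavior of a compounding schedule.
def compounding(start: float, stop: float, compound: float):
    value = start
    while True:
        yield min(value, stop)  # capped at `stop`
        value *= compound

batch_sizes = compounding(start=100, stop=1000, compound=1.001)
print([round(next(batch_sizes)) for _ in range(3)])  # grows slowly from 100 toward 1000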
@@ -37,18 +38,18 @@ compound = 1.001
 @optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
-L2_is_weight_decay = false
-L2 = 1e-6
+L2_is_weight_decay = true
+L2 = 0.01
 grad_clip = 1.0
 use_averages = true
 eps = 1e-8
 learn_rate = 0.001

-#[optimizer.learn_rate]
+#[training.optimizer.learn_rate]
 #@schedules = "warmup_linear.v1"
-#warmup_steps = 250
-#total_steps = 20000
-#initial_rate = 0.001
+#warmup_steps = 1000
+#total_steps = 50000
+#initial_rate = 0.003

 [nlp]
 lang = "en"
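The commented-out block above configures a warmup-linear learning-rate schedule. As a sketch of its shape (illustrative only, assuming linear warmup to initial_rate over warmup_steps, then linear decay toward zero at total_steps):

# Illustrative sketch, not part of the diff: shape of a warmup-linear schedule.
def warmup_linear(initial_rate: float, warmup_steps: int, total_steps: int):
    for step in range(total_steps):
        if step < warmup_steps:
            yield initial_rate * step / max(1, warmup_steps)  # ramp up
        else:
            yield initial_rate * (total_steps - step) / (total_steps - warmup_steps)  # decay

rates = list(warmup_linear(0.003, warmup_steps=1000, total_steps=50000))
print(rates[0], max(rates), rates[-1])  # 0.0, 0.003, ~0.0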
@@ -58,8 +59,6 @@ vectors = null
 factory = "ner"
 learn_tokens = false
 min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0

 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
@@ -75,6 +74,6 @@ width = 96
 depth = 4
 window_size = 1
 embed_size = 2000
-maxout_pieces = 3
+maxout_pieces = 1
 subword_features = true
 dropout = ${training:dropout}
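The dropout value above is not hard-coded but pulled from the [training] block via the config system's variable interpolation. A small sketch, assuming the section:key interpolation syntax used by thinc's config system at the time:

# Illustrative sketch, not part of the diff: config variable interpolation.
from thinc.api import Config

cfg_text = """
[training]
dropout = 0.1

[model]
dropout = ${training:dropout}
"""
config = Config().from_str(cfg_text)
print(config["model"]["dropout"])  # 0.1, resolved from [training]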
@@ -7,6 +7,7 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0a12,<8.0.0a20",
-    "blis>=0.4.0,<0.5.0"
+    "blis>=0.4.0,<0.5.0",
+    "pytokenizations"
 ]
 build-backend = "setuptools.build_meta"
@@ -14,6 +14,7 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.3.0,<2.0.0
+pytokenizations
 # Official Python utilities
 setuptools
 packaging
@@ -51,6 +51,7 @@ install_requires =
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
     pydantic>=1.3.0,<2.0.0
+    pytokenizations
    # Official Python utilities
    setuptools
    packaging

setup.py (3 changed lines)
@@ -1,11 +1,11 @@
 #!/usr/bin/env python
+from setuptools import Extension, setup, find_packages
 import sys
 import platform
 from distutils.command.build_ext import build_ext
 from distutils.sysconfig import get_python_inc
 import distutils.util
 from distutils import ccompiler, msvccompiler
-from setuptools import Extension, setup, find_packages
 import numpy
 from pathlib import Path
 import shutil
@@ -23,7 +23,6 @@ Options.docstrings = True

 PACKAGES = find_packages()
 MOD_NAMES = [
-    "spacy.gold.align",
     "spacy.gold.example",
     "spacy.parts_of_speech",
     "spacy.strings",
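The removal of the compiled spacy.gold.align module lines up with the new pytokenizations dependency added above: token-to-token alignment is now delegated to that package. A sketch of what it computes (assuming pytokenizations' get_alignments API):

# Illustrative sketch, not part of the diff: token alignment via pytokenizations.
import tokenizations  # pip install pytokenizations

a2b, b2a = tokenizations.get_alignments(["New York"], ["New", "York"])
print(a2b)  # [[0, 1]]: token 0 on the left spans both tokens on the right
print(b2a)  # [[0], [0]]: both right-hand tokens map back to token 0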
@@ -1,8 +1,7 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a1"
+__version__ = "3.0.0a2"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
 __projects__ = "https://github.com/explosion/spacy-boilerplates"
@@ -15,8 +15,10 @@ from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .validate import validate  # noqa: F401
-from .project import project_clone, project_assets, project_run  # noqa: F401
-from .project import project_run_all  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
@@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface

 DOCS: https://spacy.io/api/cli
 """
+PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
+project templates. You'd typically start by cloning a project template to a local
+directory and fetching its assets like datasets etc. See the project's
+project.yml for the available commands.
+"""


 app = typer.Typer(name=NAME, help=HELP)
+project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
+app.add_typer(project_cli)

 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
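The project commands are grouped under a Typer sub-app that is attached to the main CLI with add_typer, so they are invoked as "spacy project ...". A self-contained sketch of that composition pattern (names here are hypothetical):

# Illustrative sketch, not part of the diff: composing Typer sub-apps.
import typer

app = typer.Typer(name="mycli", help="Main CLI")
group = typer.Typer(name="project", help="Project subcommands", no_args_is_help=True)
app.add_typer(group)

@group.command("clone")
def clone(name: str):
    typer.echo(f"Cloning {name}")  # invoked as: mycli project clone NAME

if __name__ == "__main__":
    app()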
@@ -1,4 +1,4 @@
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence
 import requests
 import sys
 from wasabi import msg
@@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
 from .. import about
 from ..util import is_package, get_base_version, run_command
+
+# These are the old shortcuts we previously supported in spacy download. As of
+# v3, shortcuts are deprecated so we're not expecting to add anything to this
+# list. It only exists to show users warnings.
+OLD_SHORTCUTS = {
+    "en": "en_core_web_sm",
+    "de": "de_core_news_sm",
+    "es": "es_core_news_sm",
+    "pt": "pt_core_news_sm",
+    "fr": "fr_core_news_sm",
+    "it": "it_core_news_sm",
+    "nl": "nl_core_news_sm",
+    "el": "el_core_news_sm",
+    "nb": "nb_core_news_sm",
+    "lt": "lt_core_news_sm",
+    "xx": "xx_ent_wiki_sm",
+}

 @app.command(
     "download",
@@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
         version = components[-1]
         download_model(dl_tpl.format(m=model_name, v=version), pip_args)
     else:
-        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
-        model_name = shortcuts.get(model, model)
+        model_name = model
+        if model in OLD_SHORTCUTS:
+            msg.warn(
+                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
+                f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
+            )
+            model_name = OLD_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
         download_model(dl_tpl.format(m=model_name, v=version), pip_args)
@@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
     )


-def get_json(url: str, desc: str) -> Union[dict, list]:
-    r = requests.get(url)
+def get_compatibility() -> dict:
+    version = get_base_version(about.__version__)
+    r = requests.get(about.__compatibility__)
     if r.status_code != 200:
         msg.fail(
             f"Server error ({r.status_code})",
-            f"Couldn't fetch {desc}. Please find a model for your spaCy "
+            f"Couldn't fetch compatibility table. Please find a model for your spaCy "
             f"installation (v{about.__version__}), and download it manually. "
             f"For more details, see the documentation: "
             f"https://spacy.io/usage/models",
             exits=1,
         )
-    return r.json()
-
-
-def get_compatibility() -> dict:
-    version = get_base_version(about.__version__)
-    comp_table = get_json(about.__compatibility__, "compatibility table")
+    comp_table = r.json()
     comp = comp_table["spacy"]
     if version not in comp:
         msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
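get_compatibility() fetches a JSON table keyed first by spaCy version and then by model name. A sketch of the lookup, with hypothetical sample data:

# Illustrative sketch, not part of the diff: compatibility table lookup.
comp_table = {"spacy": {"3.0.0a2": {"en_core_web_sm": ["3.0.0a0"]}}}  # hypothetical data
comp = comp_table["spacy"]
version = "3.0.0a2"  # base version of the installed spaCy
if version in comp:
    print(comp[version]["en_core_web_sm"][0])  # most recent compatible model version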
spacy/cli/project.py (deleted, 708 lines)
@@ -1,708 +0,0 @@
from typing import List, Dict, Any, Optional, Sequence
import typer
import srsly
from pathlib import Path
from wasabi import msg
import subprocess
import os
import re
import shutil
import sys
import requests
import tqdm

from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about
from ..schemas import ProjectConfigSchema, validate
from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum, split_command


CONFIG_FILE = "project.yml"
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
DIRS = [
    "assets",
    "metas",
    "configs",
    "packages",
    "metrics",
    "scripts",
    "notebooks",
    "training",
    "corpus",
]
CACHES = [
    Path.home() / ".torch",
    Path.home() / ".caches" / "torch",
    os.environ.get("TORCH_HOME"),
    Path.home() / ".keras",
]
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
# it directly and edit the project.yml instead and re-run the project."""
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
templates. You'd typically start by cloning a project template to a local
directory and fetching its assets like datasets etc. See the project's
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
Version Control) to manage input and output files and to ensure steps are only
re-run if their inputs change.
"""

project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)


@project_cli.callback(invoke_without_command=True)
def callback(ctx: typer.Context):
    """This runs before every project command and ensures DVC is installed."""
    ensure_dvc()


################
# CLI COMMANDS #
################
@project_cli.command("clone")
|
||||
def project_clone_cli(
|
||||
# fmt: off
|
||||
name: str = Arg(..., help="The name of the template to fetch"),
|
||||
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
||||
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Clone a project template from a repository. Calls into "git" and will
|
||||
only download the files from the given subdirectory. The GitHub repo
|
||||
defaults to the official spaCy template repo, but can be customized
|
||||
(including using a private repo). Setting the --git flag will also
|
||||
initialize the project directory as a Git repo. If the project is intended
|
||||
to be a Git repo, it should be initialized with Git first, before
|
||||
initializing DVC (Data Version Control). This allows DVC to integrate with
|
||||
Git.
|
||||
"""
|
||||
if dest == Path.cwd():
|
||||
dest = dest / name
|
||||
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
|
||||
|
||||
|
||||
@project_cli.command("init")
|
||||
def project_init_cli(
|
||||
# fmt: off
|
||||
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Initialize a project directory with DVC and optionally Git. This should
|
||||
typically be taken care of automatically when you run the "project clone"
|
||||
command, but you can also run it separately. If the project is intended to
|
||||
be a Git repo, it should be initialized with Git first, before initializing
|
||||
DVC. This allows DVC to integrate with Git.
|
||||
"""
|
||||
project_init(path, git=git, force=force, silent=True)
|
||||
|
||||
|
||||
@project_cli.command("assets")
|
||||
def project_assets_cli(
|
||||
# fmt: off
|
||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Use DVC (Data Version Control) to fetch project assets. Assets are
|
||||
defined in the "assets" section of the project config. If possible, DVC
|
||||
will try to track the files so you can pull changes from upstream. It will
|
||||
also try and store the checksum so the assets are versioned. If the file
|
||||
can't be tracked or checked, it will be downloaded without DVC. If a checksum
|
||||
is provided in the project config, the file is only downloaded if no local
|
||||
file with the same checksum exists.
|
||||
"""
|
||||
project_assets(project_dir)
|
||||
|
||||
|
||||
@project_cli.command(
|
||||
"run-all",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def project_run_all_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context,
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||
# fmt: on
|
||||
):
|
||||
"""Run all commands defined in the project. This command will use DVC and
|
||||
the defined outputs and dependencies in the project config to determine
|
||||
which steps need to be re-run and where to start. This means you're only
|
||||
re-generating data if the inputs have changed.
|
||||
|
||||
This command calls into "dvc repro" and all additional arguments are passed
|
||||
to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
|
||||
"""
|
||||
if show_help:
|
||||
print_run_help(project_dir)
|
||||
else:
|
||||
project_run_all(project_dir, *ctx.args)
|
||||
|
||||
|
||||
@project_cli.command(
|
||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def project_run_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context,
|
||||
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||
# fmt: on
|
||||
):
|
||||
"""Run a named script defined in the project config. If the command is
|
||||
part of the default pipeline defined in the "run" section, DVC is used to
|
||||
determine whether the step should re-run if its inputs have changed, or
|
||||
whether everything is up to date. If the script is not part of the default
|
||||
pipeline, it will be called separately without DVC.
|
||||
|
||||
If DVC is used, the command calls into "dvc repro" and all additional
|
||||
arguments are passed to the "dvc repro" command:
|
||||
https://dvc.org/doc/command-reference/repro
|
||||
"""
|
||||
if show_help or not subcommand:
|
||||
print_run_help(project_dir, subcommand)
|
||||
else:
|
||||
project_run(project_dir, subcommand, *ctx.args)
|
||||
|
||||
|
||||
@project_cli.command("exec", hidden=True)
|
||||
def project_exec_cli(
|
||||
# fmt: off
|
||||
subcommand: str = Arg(..., help="Name of command defined in project config"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Execute a command defined in the project config. This CLI command is
|
||||
only called internally in auto-generated DVC pipelines, as a shortcut for
|
||||
multi-step commands in the project config. You typically shouldn't have to
|
||||
call it yourself. To run a command, call "run" or "run-all".
|
||||
"""
|
||||
project_exec(project_dir, subcommand)
|
||||
|
||||
|
||||
@project_cli.command("update-dvc")
|
||||
def project_update_dvc_cli(
|
||||
# fmt: off
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Update the auto-generated DVC config file. Uses the steps defined in the
|
||||
"run" section of the project config. This typically happens automatically
|
||||
when running a command, but can also be triggered manually if needed.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
||||
if updated:
|
||||
msg.good(f"Updated DVC config from {CONFIG_FILE}")
|
||||
else:
|
||||
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
||||
|
||||
|
||||
app.add_typer(project_cli, name="project")
|
||||
|
||||
|
||||
#################
|
||||
# CLI FUNCTIONS #
|
||||
#################
|
||||
|
||||
|
||||
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    git: bool = False,
    no_init: bool = False,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    git (bool): Initialize project as Git repo. Should be set to True if project
        is intended as a repo, since it will allow DVC to integrate with Git.
    no_init (bool): Don't initialize DVC and Git automatically. If True, the
        "init" command or "git init" and "dvc init" need to be run manually.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
        try:
            run_command(cmd)
        except SystemExit:
            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
            msg.fail(err)
        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
            f.write(name)
        try:
            run_command(["git", "-C", str(tmp_dir), "fetch"])
            run_command(["git", "-C", str(tmp_dir), "checkout"])
        except SystemExit:
            err = f"Could not clone '{name}' in the repo '{repo}'."
            msg.fail(err)
        shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
    msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
    for sub_dir in DIRS:
        dir_path = project_dir / sub_dir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    if not no_init:
        project_init(project_dir, git=git, force=True, silent=True)
    msg.good(f"Your project is now ready!", dest)
    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")


def project_init(
    project_dir: Path,
    *,
    git: bool = False,
    force: bool = False,
    silent: bool = False,
    analytics: bool = False,
):
    """Initialize a project as a DVC and (optionally) as a Git repo.

    project_dir (Path): Path to project directory.
    git (bool): Also call "git init" to initialize directory as a Git repo.
    silent (bool): Don't print any output (via DVC).
    analytics (bool): Opt-in to DVC analytics (defaults to False).
    """
    with working_dir(project_dir) as cwd:
        if git:
            run_command(["git", "init"])
        init_cmd = ["dvc", "init"]
        if silent:
            init_cmd.append("--quiet")
        if not git:
            init_cmd.append("--no-scm")
        if force:
            init_cmd.append("--force")
        run_command(init_cmd)
        # We don't want to have analytics on by default – our users should
        # opt-in explicitly. If they want it, they can always enable it.
        if not analytics:
            run_command(["dvc", "config", "core.analytics", "false"])
        # Remove unused and confusing plot templates from .dvc directory
        # TODO: maybe we shouldn't do this, but it's otherwise super confusing
        # once you commit your changes via Git and it creates a bunch of files
        # that have no purpose
        plots_dir = cwd / DVC_DIR / "plots"
        if plots_dir.exists():
            shutil.rmtree(str(plots_dir))
        config = load_project_config(cwd)
        setup_check_dvc(cwd, config)


def project_assets(project_dir: Path) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    setup_check_dvc(project_path, config)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    variables = config.get("variables", {})
    fetched_assets = []
    for asset in assets:
        url = asset["url"].format(**variables)
        dest = asset["dest"].format(**variables)
        fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
        if fetched_path:
            fetched_assets.append(str(fetched_path))
    if fetched_assets:
        with working_dir(project_path):
            run_command(["dvc", "add", *fetched_assets, "--external"])
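project_assets resolves the "variables" section of the project config into asset URLs and destinations with plain str.format. A sketch with hypothetical values:

# Illustrative sketch, not part of the diff: variable substitution in asset entries.
variables = {"lang": "en", "version": "1.0"}  # hypothetical project.yml variables
asset = {
    "url": "https://example.com/{lang}-data-{version}.json",
    "dest": "assets/{lang}-data.json",
}
print(asset["url"].format(**variables))   # https://example.com/en-data-1.0.json
print(asset["dest"].format(**variables))  # assets/en-data.json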
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
    """Fetch an asset from a given URL or path. Will try to import the file
    using DVC's import-url if possible (fully tracked and versioned) and falls
    back to get-url (versioned) and a non-DVC download if necessary. If a
    checksum is provided and a local file exists, it's only re-downloaded if the
    checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
        the asset failed.
    """
    url = convert_asset_url(url)
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        # TODO: add support for caches (dvc import-url with local path)
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return dest_path
    with working_dir(project_path):
        try:
            # If these fail, we don't want to output an error or info message.
            # Try with tracking the source first, then just downloading with
            # DVC, then a regular non-DVC download.
            try:
                dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
            except subprocess.CalledProcessError:
                dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
        except subprocess.CalledProcessError:
            try:
                download_file(url, dest_path)
            except requests.exceptions.HTTPError as e:
                msg.fail(f"Download failed: {dest}", e)
                return None
    if checksum and checksum != get_checksum(dest_path):
        msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
    msg.good(f"Fetched asset {dest}")
    return dest_path


def project_run_all(project_dir: Path, *dvc_args) -> None:
    """Run all commands defined in the project using DVC.

    project_dir (Path): Path to project directory.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    dvc_cmd = ["dvc", "repro", *dvc_args]
    with working_dir(project_dir):
        run_command(dvc_cmd)


def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project config.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    if subcommand:
        validate_subcommand(commands.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
        help_text = commands[subcommand].get("help")
        if help_text:
            msg.text(f"\n{help_text}\n")
    else:
        print(f"\nAvailable commands in {CONFIG_FILE}")
        print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        msg.text("Run all commands defined in the 'run' block of the project config:")
        print(f"{COMMAND} project run-all {project_dir}")


def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
    """Run a named script defined in the project config. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    validate_subcommand(commands.keys(), subcommand)
    if subcommand in config.get("run", []):
        # This is one of the pipeline commands tracked in DVC
        dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
        with working_dir(project_dir):
            run_command(dvc_cmd)
    else:
        cmd = commands[subcommand]
        # Deps in non-DVC commands aren't tracked, but if they're defined,
        # make sure they exist before running the command
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                msg.fail(err, exits=1)
        with working_dir(project_dir):
            run_commands(cmd["script"], variables)


def project_exec(project_dir: Path, subcommand: str):
    """Execute a command defined in the project config.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    with working_dir(project_dir):
        run_commands(commands[subcommand]["script"], variables)


###########
# HELPERS #
###########
def load_project_config(path: Path) -> Dict[str, Any]:
    """Load the project config file from a directory and validate it.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project config.
    """
    config_path = path / CONFIG_FILE
    if not config_path.exists():
        msg.fail("Can't find project config", config_path, exits=1)
    invalid_err = f"Invalid project config in {CONFIG_FILE}"
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err, "\n".join(errors), exits=1)
    return config


def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project config, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    commands = []
    # We only want to include commands that are part of the main list of "run"
    # commands in project.yml and should be run in sequence
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    for name in config.get("run", []):
        validate_subcommand(config_commands.keys(), name)
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "exec", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
        dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
        if verbose:
            dvc_cmd.append("--verbose")
        if silent:
            dvc_cmd.append("--quiet")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        commands.append(" ".join(full_cmd))
    with working_dir(path):
        run_commands(commands, variables, silent=True)
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
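The nested comprehensions in update_dvc_config interleave each dependency or output with its flag. The idiom is equivalent to flattening flag/value pairs, e.g. via itertools.chain:

# Illustrative sketch, not part of the diff: flattening flag/value pairs.
from itertools import chain

deps = ["assets/train.json", "assets/dev.json"]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
assert deps_cmd == list(chain.from_iterable(["-d", p] for p in deps))
print(deps_cmd)  # ['-d', 'assets/train.json', '-d', 'assets/dev.json']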
def ensure_dvc() -> None:
    """Ensure that the "dvc" command is available and show an error if not."""
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "spaCy projects require DVC (Data Version Control) and the 'dvc' command",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )


def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
    """Check that the project is set up correctly with DVC and update its
    config if needed. Will raise an error if the project is not an initialized
    DVC project.

    project_dir (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    """
    if not project_dir.exists():
        msg.fail(f"Can't find project directory: {project_dir}")
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project.",
            f"Make sure that the project template was cloned correctly. To "
            f"initialize the project directory manually, you can run: "
            f"{COMMAND} project init {project_dir}",
            exits=1,
        )
    with msg.loading("Updating DVC config..."):
        updated = update_dvc_config(project_dir, config, silent=True)
    if updated:
        msg.good(f"Updated DVC config from changed {CONFIG_FILE}")


def run_commands(
    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    silent (bool): Don't print the commands.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if len(command) and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif len(command) and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {' '.join(command)}")
        run_command(command)
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if re.match("(http(s?)):\/\/github.com", url):
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url


def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            f"Cloning spaCy project templates requires Git and the 'git' command. ",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run:",
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )


def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    subcommand (str): The subcommand.
    """
    if subcommand not in commands:
        msg.fail(
            f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
            f"Available commands: {', '.join(commands)}",
            exits=1,
        )


def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    progress_settings = {
        "total": total,
        "unit": "iB",
        "unit_scale": True,
        "unit_divisor": chunk_size,
        "leave": False,
    }
    with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
        for data in response.iter_content(chunk_size=chunk_size):
            size = f.write(data)
            bar.update(size)
spacy/cli/project/__init__.py (new, empty file)
spacy/cli/project/assets.py (new file, 154 lines)
@@ -0,0 +1,154 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import requests
import tqdm
import re
import shutil

from ...util import ensure_path, get_checksum, working_dir
from .._app import project_cli, Arg
from .util import PROJECT_FILE, load_project_config


# TODO: find a solution for caches
# CACHES = [
#     Path.home() / ".torch",
#     Path.home() / ".caches" / "torch",
#     os.environ.get("TORCH_HOME"),
#     Path.home() / ".keras",
# ]


@project_cli.command("assets")
def project_assets_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.
    """
    project_assets(project_dir)
def project_assets(project_dir: Path) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    variables = config.get("variables", {})
    for asset in assets:
        dest = asset["dest"].format(**variables)
        url = asset.get("url")
        checksum = asset.get("checksum")
        if not url:
            # project.yml defines asset without URL that the user has to place
            check_private_asset(dest, checksum)
            continue
        url = url.format(**variables)
        fetch_asset(project_path, url, dest, checksum)


def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Check and validate assets without a URL (private assets that the user
    has to provide themselves) and give feedback about the checksum.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
        msg.warn(err)
    else:
        if checksum and checksum == get_checksum(dest):
            msg.good(f"Asset exists with matching checksum: {dest}")
        else:
            msg.fail(f"Asset available but with incorrect checksum: {dest}")
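Both check_private_asset and fetch_asset compare files against an expected checksum via the get_checksum helper. A sketch of what such a check can look like (the MD5 digest here is an assumption for illustration; spaCy's helper may use a different digest):

# Illustrative sketch, not part of the diff: checksum verification for a file.
import hashlib
from pathlib import Path

def file_checksum(path: Path, chunk_size: int = 8192) -> str:
    digest = hashlib.md5()  # assumed digest, for illustration
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()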
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Fetch an asset from a given URL or path. If a checksum is provided and a
    local file exists, it's only re-downloaded if the checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
        the asset failed.
    """
    # TODO: add support for caches
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return dest_path
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            if Path(url).exists() and Path(url).is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
                return
    if checksum and checksum != get_checksum(dest_path):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if re.match(r"(http(s?)):\/\/github.com", url):
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
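Worked example of the conversion convert_asset_url performs on a regular GitHub link (the repository path is hypothetical):

# Illustrative sketch, not part of the diff: raw URL conversion.
import re

url = "https://github.com/user/repo/blob/master/data/train.json"
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
print(converted)  # https://raw.githubusercontent.com/user/repo/master/data/train.json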
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    progress_settings = {
        "total": total,
        "unit": "iB",
        "unit_scale": True,
        "unit_divisor": chunk_size,
        "leave": False,
    }
    with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
        for data in response.iter_content(chunk_size=chunk_size):
            size = f.write(data)
            bar.update(size)
spacy/cli/project/clone.py (new file, 110 lines)
@@ -0,0 +1,110 @@
from pathlib import Path
from wasabi import msg
import subprocess
import shutil

from ... import about
from ...util import ensure_path, run_command, make_tempdir
from .._app import project_cli, Arg, Opt, COMMAND


DIRS = [
    "assets",
    "metas",
    "configs",
    "packages",
    "metrics",
    "scripts",
    "notebooks",
    "training",
    "corpus",
]
@project_cli.command("clone")
|
||||
def project_clone_cli(
|
||||
# fmt: off
|
||||
name: str = Arg(..., help="The name of the template to fetch"),
|
||||
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
||||
# fmt: on
|
||||
):
|
||||
"""Clone a project template from a repository. Calls into "git" and will
|
||||
only download the files from the given subdirectory. The GitHub repo
|
||||
defaults to the official spaCy template repo, but can be customized
|
||||
(including using a private repo). Setting the --git flag will also
|
||||
initialize the project directory as a Git repo. If the project is intended
|
||||
to be a Git repo, it should be initialized with Git first, before
|
||||
initializing DVC (Data Version Control). This allows DVC to integrate with
|
||||
Git.
|
||||
"""
|
||||
if dest == Path.cwd():
|
||||
dest = dest / name
|
||||
project_clone(name, dest, repo=repo)
|
||||
|
||||
|
||||
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
        try:
            run_command(cmd)
        except subprocess.CalledProcessError:
            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
            msg.fail(err)
        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
            f.write(name)
        try:
            run_command(["git", "-C", str(tmp_dir), "fetch"])
            run_command(["git", "-C", str(tmp_dir), "checkout"])
        except subprocess.CalledProcessError:
            err = f"Could not clone '{name}' in the repo '{repo}'."
            msg.fail(err)
        shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
    msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
    for sub_dir in DIRS:
        dir_path = project_dir / sub_dir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    msg.good(f"Your project is now ready!", dest)
    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
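project_clone relies on Git's sparse checkout so only the requested template directory is downloaded. The bare steps, reduced to a standalone sketch (repo and subdirectory are hypothetical):

# Illustrative sketch, not part of the diff: sparse checkout of one subdirectory.
import subprocess
import tempfile

repo, subdir = "https://github.com/user/templates", "some_template"  # hypothetical
with tempfile.TemporaryDirectory() as tmp:
    subprocess.run(
        ["git", "clone", repo, tmp, "--no-checkout", "--depth", "1",
         "--config", "core.sparseCheckout=true"],
        check=True,
    )
    with open(f"{tmp}/.git/info/sparse-checkout", "w") as f:
        f.write(subdir)  # restrict the checkout to the one directory we want
    subprocess.run(["git", "-C", tmp, "fetch"], check=True)
    subprocess.run(["git", "-C", tmp, "checkout"], check=True)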
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            f"Cloning spaCy project templates requires Git and the 'git' command. ",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run:",
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )
spacy/cli/project/dvc.py (new file, 206 lines)
@@ -0,0 +1,206 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional
import subprocess
from pathlib import Path
from wasabi import msg

from .util import PROJECT_FILE, load_project_config
from .._app import project_cli, Arg, Opt, NAME, COMMAND
from ...util import get_hash, working_dir, split_command, join_command, run_command


DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the checksum changed.
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    updated = update_dvc_config(
        project_dir, config, workflow, verbose=verbose, force=force
    )
    help_msg = "To execute the workflow with DVC, run: dvc repro"
    if updated:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
    else:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    if not workflow:
        workflow = workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project.yml, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    dvc_commands = []
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "run", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        dvc_commands.append(join_command(full_cmd))
    with working_dir(path):
        dvc_flags = {"--verbose": verbose, "--quiet": silent}
        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
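The regeneration guard in update_dvc_config depends on the first line of the generated dvc.yaml being "# <hash of project.yml>". A sketch of just that guard:

# Illustrative sketch, not part of the diff: first-line hash guard for dvc.yaml.
from pathlib import Path

def needs_update(dvc_config_path: Path, config_hash: str, force: bool = False) -> bool:
    if not dvc_config_path.exists():
        return True
    with dvc_config_path.open("r", encoding="utf8") as f:
        ref_hash = f.readline().strip().replace("# ", "")
    return force or ref_hash != config_hash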
def run_dvc_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, str] = {},
    flags: Dict[str, bool] = {},
) -> None:
    """Run a sequence of DVC commands in a subprocess, in order.

    commands (List[str]): The string commands without the leading "dvc".
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
        easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        dvc_command = ["dvc", *command]
        # Add the flags if they are set to True
        for flag, is_active in flags.items():
            if is_active:
                dvc_command.append(flag)
        run_command(dvc_command)

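A quick sketch of how the flags mapping behaves (command string and variable invented): only flags mapped to True are appended, so callers can pass the whole mapping unconditionally.

run_dvc_commands(["repro {STAGE}"], {"STAGE": "train"}, flags={"--quiet": True, "--verbose": False})
# executes: dvc repro train --quiet
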
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    if workflow is not None and workflow not in workflows:
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}",
            exits=1,
        )
    if not workflow:
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )

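Behavior sketch with invented workflow names:

workflows = {"all": ["preprocess", "train", "evaluate"]}
check_workflows(list(workflows.keys()))  # OK, but warns that the first workflow ("all") will be used
check_workflows([])                      # exits: no workflows defined
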
def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project.
    """
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
spacy/cli/project/run.py (new file, 250 lines)
@@ -0,0 +1,250 @@
from typing import Optional, List, Dict, Sequence, Any
from pathlib import Path
from wasabi import msg
import typer
import sys
import srsly

from ...util import working_dir, run_command, split_command, is_cwd, get_checksum
from ...util import get_hash, join_command
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config


@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named script or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define inputs and/or outputs, they will only be re-run if state
    has changed.
    """
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        project_run(project_dir, subcommand, *ctx.args, force=force, dry=dry)

def project_run(
    project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
) -> None:
    """Run a named script defined in the project.yml. If the command defines
    dependencies and/or outputs, the lockfile is checked to determine whether
    the command needs to be re-run, and the lockfile is updated afterwards.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    """
    config = load_project_config(project_dir)
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(project_dir, cmd, force=force, dry=dry)
    else:
        cmd = commands[subcommand]
        variables = config.get("variables", {})
        for dep in cmd.get("deps", []):
            dep = dep.format(**variables)
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, **err_kwargs)
        with working_dir(project_dir) as current_dir:
            rerun = check_rerun(current_dir, cmd, variables)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                msg.divider(subcommand)
                run_commands(cmd["script"], variables, dry=dry)
                update_lockfile(current_dir, cmd, variables)

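A hedged usage sketch (directory and names invented):

project_run(Path("my_project"), "all", dry=True)      # walk the "all" workflow without executing
project_run(Path("my_project"), "train", force=True)  # re-run even if the lockfile matches
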
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    project_loc = "" if is_cwd(project_dir) else project_dir
    if subcommand:
        validate_subcommand(commands.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
        help_text = commands[subcommand].get("help")
        if help_text:
            msg.text(f"\n{help_text}\n")
    else:
        print(f"\nAvailable commands in {PROJECT_FILE}")
        print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:")
        print(f"{COMMAND} project run {project_loc}")

def run_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, Any] = {},
    silent: bool = False,
    dry: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if len(command) and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif len(command) and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if not dry:
            run_command(command)

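Sketch of the interpreter substitution (commands invented; with dry=True the commands are printed but not executed):

run_commands(["python -m spacy info", "pip install -U srsly"], {}, dry=True)
# prints e.g.:
# Running command: /usr/bin/python3 -m spacy info
# Running command: /usr/bin/python3 -m pip install -U srsly
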
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand not in commands and subcommand not in workflows:
        help_msg = []
        if commands:
            help_msg.append(f"Available commands: {', '.join(commands)}")
        if workflows:
            help_msg.append(f"Available workflows: {', '.join(workflows)}")
        msg.fail(
            f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
            ". ".join(help_msg),
            exits=1,
        )

def check_rerun(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (bool): Whether to re-run the command.
    """
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():  # We don't have a lockfile, run command
        return True
    data = srsly.read_yaml(lock_path)
    if command["name"] not in data:  # We don't have info about this command
        return True
    entry = data[command["name"]]
    # If the entry in the lockfile matches the lockfile entry that would be
    # generated from the current command, we don't rerun because it means that
    # all inputs/outputs, hashes and scripts are the same and nothing changed
    return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)

def update_lockfile(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():
        srsly.write_yaml(lock_path, {})
        data = {}
    else:
        data = srsly.read_yaml(lock_path)
    data[command["name"]] = get_lock_entry(project_dir, command, variables)
    srsly.write_yaml(lock_path, data)

def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    deps = get_fileinfo(project_dir, command.get("deps", []), variables)
    outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
    return {
        "cmd": f"{COMMAND} run {command['name']}",
        "script": command["script"],
        "deps": deps,
        "outs": [*outs, *outs_nc],
    }

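For a hypothetical "train" command, the resulting entry might look like this (paths and hashes invented; COMMAND comes from .._app):

{
    "cmd": f"{COMMAND} run train",
    "script": ["python scripts/train.py"],
    "deps": [{"path": "assets/data.json", "md5": "6c1f6a81..."}],
    "outs": [{"path": "training/model-best", "md5": None}],
}
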
def get_fileinfo(
    project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
    """
    data = []
    for path in paths:
        path = path.format(**variables)
        file_path = project_dir / path
        md5 = get_checksum(file_path) if file_path.exists() else None
        data.append({"path": path, "md5": md5})
    return data
spacy/cli/project/util.py (new file, 57 lines)
@@ -0,0 +1,57 @@
from typing import Dict, Any
from pathlib import Path
from wasabi import msg
import srsly

from ...schemas import ProjectConfigSchema, validate


PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"


def load_project_config(path: Path) -> Dict[str, Any]:
    """Load the project.yml file from a directory and validate it.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err, "\n".join(errors), exits=1)
    validate_project_commands(config)
    return config

def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
    if duplicates:
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )

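Illustrative failure case (config invented):

config = {
    "commands": [{"name": "train", "script": ["python train.py"]}],
    "workflows": {"train": ["train"]},
}
validate_project_commands(config)
# exits: "Can't use workflow name 'train': name already exists as a command"
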
@@ -203,7 +203,8 @@ def train(
    msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
    train_examples = list(
        corpus.train_dataset(
            nlp, shuffle=False, gold_preproc=training["gold_preproc"]
            nlp, shuffle=False, gold_preproc=training["gold_preproc"],
            max_length=training["max_length"]
        )
    )
    nlp.begin_training(lambda: train_examples)
@@ -306,11 +307,18 @@ def create_train_batches(nlp, corpus, cfg):
    if len(train_examples) == 0:
        raise ValueError(Errors.E988)
    epoch += 1
    batches = util.minibatch_by_words(
        train_examples,
        size=cfg["batch_size"],
        discard_oversize=cfg["discard_oversize"],
    )
    if cfg.get("batch_by_words", True):
        batches = util.minibatch_by_words(
            train_examples,
            size=cfg["batch_size"],
            discard_oversize=cfg["discard_oversize"],
        )
    else:
        batches = util.minibatch(
            train_examples,
            size=cfg["batch_size"],
        )

    # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
    try:
        first = next(batches)
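A sketch of the new toggle (cfg values invented): with batch_by_words disabled, batching falls back to a fixed number of examples per batch.

cfg = {"batch_size": 1000, "discard_oversize": True, "batch_by_words": False}
# -> util.minibatch(train_examples, size=1000): 1000 examples per batch,
#    instead of batches sized by total word count
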
@@ -477,15 +477,14 @@ class Errors(object):
    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

    # TODO: fix numbering after merging develop into master
    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
    E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
            "array and {doc_length} for the Doc itself.")
    E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
    E973 = ("Unexpected type for NER data")
    E974 = ("Unknown {obj} attribute: {key}")
    E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
            "but got {type}")
    E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
            "but received None.")
    E977 = ("Can not compare a MorphAnalysis with a string object. "
            "This is likely a bug in spaCy, so feel free to open an issue.")
@@ -1,6 +1,6 @@
from .corpus import Corpus
from .example import Example
from .align import align
from .align import Alignment

from .iob_utils import iob_to_biluo, biluo_to_iob
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
@@ -1,8 +0,0 @@
cdef class Alignment:
    cdef public object cost
    cdef public object i2j
    cdef public object j2i
    cdef public object i2j_multi
    cdef public object j2i_multi
    cdef public object cand_to_gold
    cdef public object gold_to_cand
spacy/gold/align.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from typing import List
import numpy
from thinc.types import Ragged
from dataclasses import dataclass
import tokenizations


@dataclass
class Alignment:
    x2y: Ragged
    y2x: Ragged

    @classmethod
    def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
        x2y = _make_ragged(x2y)
        y2x = _make_ragged(y2x)
        return Alignment(x2y=x2y, y2x=y2x)

    @classmethod
    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
        x2y, y2x = tokenizations.get_alignments(A, B)
        return Alignment.from_indices(x2y=x2y, y2x=y2x)


def _make_ragged(indices):
    lengths = numpy.array([len(x) for x in indices], dtype="i")
    flat = []
    for x in indices:
        flat.extend(x)
    return Ragged(numpy.array(flat, dtype="i"), lengths)

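A brief sketch of the new alignment API (token lists invented; the exact indices come from pytokenizations):

align = Alignment.from_strings(["New", "York", "City"], ["New York", "City"])
# align.x2y is a Ragged array: "New" and "York" should both map to y-token 0
# and "City" to y-token 1, i.e. lengths [1, 1, 1] and flat data [0, 0, 1]
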
@@ -1,101 +0,0 @@
import numpy
from ..errors import Errors, AlignmentError


cdef class Alignment:
    def __init__(self, spacy_words, gold_words):
        # Do many-to-one alignment for misaligned tokens.
        # If we over-segment, we'll have one gold word that covers a sequence
        # of predicted words
        # If we under-segment, we'll have one predicted word that covers a
        # sequence of gold words.
        # If we "mis-segment", we'll have a sequence of predicted words covering
        # a sequence of gold words. That's many-to-many -- we don't do that
        # except for NER spans where the start and end can be aligned.
        cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words)
        self.cost = cost
        self.i2j = i2j
        self.j2i = j2i
        self.i2j_multi = i2j_multi
        self.j2i_multi = j2i_multi
        self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
        self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]


def align(tokens_a, tokens_b):
    """Calculate alignment tables between two tokenizations.

    tokens_a (List[str]): The candidate tokenization.
    tokens_b (List[str]): The reference tokenization.
    RETURNS: (tuple): A 5-tuple consisting of the following information:
      * cost (int): The number of misaligned tokens.
      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
        it has the value -1.
      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
        the same token of `tokens_b`.
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
        direction.
    """
    tokens_a = _normalize_for_alignment(tokens_a)
    tokens_b = _normalize_for_alignment(tokens_b)
    cost = 0
    a2b = numpy.empty(len(tokens_a), dtype="i")
    b2a = numpy.empty(len(tokens_b), dtype="i")
    a2b.fill(-1)
    b2a.fill(-1)
    a2b_multi = {}
    b2a_multi = {}
    i = 0
    j = 0
    offset_a = 0
    offset_b = 0
    while i < len(tokens_a) and j < len(tokens_b):
        a = tokens_a[i][offset_a:]
        b = tokens_b[j][offset_b:]
        if a == b:
            if offset_a == offset_b == 0:
                a2b[i] = j
                b2a[j] = i
            elif offset_a == 0:
                cost += 2
                a2b_multi[i] = j
            elif offset_b == 0:
                cost += 2
                b2a_multi[j] = i
            offset_a = offset_b = 0
            i += 1
            j += 1
        elif a == "":
            assert offset_a == 0
            cost += 1
            i += 1
        elif b == "":
            assert offset_b == 0
            cost += 1
            j += 1
        elif b.startswith(a):
            cost += 1
            if offset_a == 0:
                a2b_multi[i] = j
            i += 1
            offset_a = 0
            offset_b += len(a)
        elif a.startswith(b):
            cost += 1
            if offset_b == 0:
                b2a_multi[j] = i
            j += 1
            offset_b = 0
            offset_a += len(b)
        else:
            assert "".join(tokens_a) != "".join(tokens_b)
            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
    return cost, a2b, b2a, a2b_multi, b2a_multi


def _normalize_for_alignment(tokens):
    return [w.replace(" ", "").lower() for w in tokens]
@@ -1,8 +1,7 @@
from ..tokens.doc cimport Doc
from .align cimport Alignment


cdef class Example:
    cdef readonly Doc x
    cdef readonly Doc y
    cdef readonly Alignment _alignment
    cdef readonly object _alignment
@@ -6,10 +6,9 @@ from ..tokens.doc cimport Doc
from ..tokens.span cimport Span
from ..tokens.span import Span
from ..attrs import IDS
from .align cimport Alignment
from .align import Alignment
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
from .iob_utils import spans_from_biluo_tags
from .align import Alignment
from ..errors import Errors, Warnings
from ..syntax import nonproj
@@ -28,8 +27,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):


cdef class Example:
    def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
        """ Doc can either be text, or an actual Doc """
    def __init__(self, Doc predicted, Doc reference, *, alignment=None):
        if predicted is None:
            raise TypeError(Errors.E972.format(arg="predicted"))
        if reference is None:
@@ -60,17 +58,15 @@ cdef class Example:

    @classmethod
    def from_dict(cls, Doc predicted, dict example_dict):
        if predicted is None:
            raise ValueError(Errors.E976.format(n="first", type="Doc"))
        if example_dict is None:
            raise ValueError(Errors.E976)
        if not isinstance(predicted, Doc):
            raise TypeError(Errors.E975.format(type=type(predicted)))
            raise ValueError(Errors.E976.format(n="second", type="dict"))
        example_dict = _fix_legacy_dict_data(example_dict)
        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
        if "ORTH" not in tok_dict:
            tok_dict["ORTH"] = [tok.text for tok in predicted]
            tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
        if not _has_field(tok_dict, "SPACY"):
            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
        return Example(
            predicted,
            annotations2doc(predicted.vocab, tok_dict, doc_dict)
@@ -83,34 +79,38 @@ cdef class Example:
            gold_words = [token.orth_ for token in self.reference]
            if gold_words == []:
                gold_words = spacy_words
            self._alignment = Alignment(spacy_words, gold_words)
            self._alignment = Alignment.from_strings(spacy_words, gold_words)
        return self._alignment

    def get_aligned(self, field, as_string=False):
        """Return an aligned array for a token attribute."""
        i2j_multi = self.alignment.i2j_multi
        cand_to_gold = self.alignment.cand_to_gold
        align = self.alignment.x2y

        vocab = self.reference.vocab
        gold_values = self.reference.to_array([field])
        output = [None] * len(self.predicted)
        for i, gold_i in enumerate(cand_to_gold):
            if self.predicted[i].text.isspace():
                output[i] = None
            if gold_i is None:
                if i in i2j_multi:
                    output[i] = gold_values[i2j_multi[i]]
                else:
                    output[i] = None
        for token in self.predicted:
            if token.is_space:
                output[token.i] = None
            else:
                output[i] = gold_values[gold_i]
                values = gold_values[align[token.i].dataXd]
                values = values.ravel()
                if len(values) == 0:
                    output[token.i] = None
                elif len(values) == 1:
                    output[token.i] = values[0]
                elif len(set(list(values))) == 1:
                    # If all aligned tokens have the same value, use it.
                    output[token.i] = values[0]
                else:
                    output[token.i] = None
        if as_string and field not in ["ENT_IOB", "SENT_START"]:
            output = [vocab.strings[o] if o is not None else o for o in output]
        return output

    def get_aligned_parse(self, projectivize=True):
        cand_to_gold = self.alignment.cand_to_gold
        gold_to_cand = self.alignment.gold_to_cand
        cand_to_gold = self.alignment.x2y
        gold_to_cand = self.alignment.y2x
        aligned_heads = [None] * self.x.length
        aligned_deps = [None] * self.x.length
        heads = [token.head.i for token in self.y]
@@ -118,52 +118,51 @@ cdef class Example:
        if projectivize:
            heads, deps = nonproj.projectivize(heads, deps)
        for cand_i in range(self.x.length):
            gold_i = cand_to_gold[cand_i]
            if gold_i is not None:  # Alignment found
                gold_head = gold_to_cand[heads[gold_i]]
                if gold_head is not None:
                    aligned_heads[cand_i] = gold_head
            if cand_to_gold.lengths[cand_i] == 1:
                gold_i = cand_to_gold[cand_i].dataXd[0, 0]
                if gold_to_cand.lengths[heads[gold_i]] == 1:
                    aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0])
                    aligned_deps[cand_i] = deps[gold_i]
        return aligned_heads, aligned_deps

    def get_aligned_spans_x2y(self, x_spans):
        return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y)

    def get_aligned_spans_y2x(self, y_spans):
        return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x)

    def _get_aligned_spans(self, doc, spans, align):
        seen = set()
        output = []
        for span in spans:
            indices = align[span.start : span.end].data.ravel()
            indices = [idx for idx in indices if idx not in seen]
            if len(indices) >= 1:
                aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label)
                target_text = span.text.lower().strip().replace(" ", "")
                our_text = aligned_span.text.lower().strip().replace(" ", "")
                if our_text == target_text:
                    output.append(aligned_span)
                    seen.update(indices)
        return output

    def get_aligned_ner(self):
        if not self.y.is_nered:
            return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
        x_text = self.x.text
        # Get a list of entities, and make spans for non-entity tokens.
        # We then work through the spans in order, trying to find them in
        # the text and using that to get the offset. Any token that doesn't
        # get a tag set this way is tagged None.
        # This could maybe be improved? It at least feels easy to reason about.
        y_spans = list(self.y.ents)
        y_spans.sort()
        x_text_offset = 0
        x_spans = []
        for y_span in y_spans:
            if x_text.count(y_span.text) >= 1:
                start_char = x_text.index(y_span.text) + x_text_offset
                end_char = start_char + len(y_span.text)
                x_span = self.x.char_span(start_char, end_char, label=y_span.label)
                if x_span is not None:
                    x_spans.append(x_span)
                x_text = self.x.text[end_char:]
                x_text_offset = end_char
        x_ents = self.get_aligned_spans_y2x(self.y.ents)
        # Default to 'None' for missing values
        x_tags = biluo_tags_from_offsets(
            self.x,
            [(e.start_char, e.end_char, e.label_) for e in x_spans],
            [(e.start_char, e.end_char, e.label_) for e in x_ents],
            missing=None
        )
        gold_to_cand = self.alignment.gold_to_cand
        for token in self.y:
            if token.ent_iob_ == "O":
                cand_i = gold_to_cand[token.i]
                if cand_i is not None and x_tags[cand_i] is None:
                    x_tags[cand_i] = "O"
        i2j_multi = self.alignment.i2j_multi
        for i, tag in enumerate(x_tags):
            if tag is None and i in i2j_multi:
                gold_i = i2j_multi[i]
                if gold_i is not None and self.y[gold_i].ent_iob_ == "O":
        # Now fill the tokens we can align to O.
        O = 2  # I=1, O=2, B=3
        for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
            if x_tags[i] is None:
                if ent_iob == O:
                    x_tags[i] = "O"
                elif self.x[i].is_space:
                    x_tags[i] = "O"
        return x_tags
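A sketch of the span-projection helpers above (the Docs are invented, so the calls are shown as comments): get_aligned_spans_y2x projects gold spans onto the predicted tokenization and drops spans whose surface text no longer matches.

# eg = Example(predicted_doc, reference_doc)
# ents_on_pred = eg.get_aligned_spans_y2x(eg.reference.ents)
# -> list of Span objects over eg.predicted, one per recoverable gold entity
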
@@ -194,25 +193,22 @@ cdef class Example:
                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
        return links

    def split_sents(self):
        """ Split the token annotations into multiple Examples based on
        sent_starts and return a list of the new Examples"""
        if not self.reference.is_sentenced:
            return [self]

        sent_starts = self.get_aligned("SENT_START")
        sent_starts.append(1)  # appending virtual start of a next sentence to facilitate search

        align = self.alignment.y2x
        seen_indices = set()
        output = []
        pred_start = 0
        for sent in self.reference.sents:
            new_ref = sent.as_doc()
            pred_end = sent_starts.index(1, pred_start + 1)  # find where the next sentence starts
            new_pred = self.predicted[pred_start : pred_end].as_doc()
            output.append(Example(new_pred, new_ref))
            pred_start = pred_end

        for y_sent in self.reference.sents:
            indices = align[y_sent.start : y_sent.end].data.ravel()
            indices = [idx for idx in indices if idx not in seen_indices]
            if indices:
                x_sent = self.predicted[indices[0] : indices[-1] + 1]
                output.append(Example(x_sent.as_doc(), y_sent.as_doc()))
                seen_indices.update(indices)
        return output

    property text:
@@ -258,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
                values.append([vocab.morphology.add(v) for v in value])
            else:
                attrs.append(key)
                values.append([vocab.strings.add(v) for v in value])
                try:
                    values.append([vocab.strings.add(v) for v in value])
                except TypeError:
                    types = set([type(v) for v in value])
                    raise TypeError(Errors.E969.format(field=key, types=types))

    array = numpy.asarray(values, dtype="uint64")
    return attrs, array.T
@@ -540,19 +540,15 @@ class Language(object):

        if component_cfg is None:
            component_cfg = {}
        component_deps = count_pipeline_interdependencies(self.pipeline)
        # Determine whether component should set annotations. In theory I guess
        # we should do this by inspecting the meta? Or we could just always
        # say "yes"
        for i, (name, proc) in enumerate(self.pipeline):
            component_cfg.setdefault(name, {})
            component_cfg[name].setdefault("drop", drop)
            component_cfg[name]["set_annotations"] = bool(component_deps[i])
            component_cfg[name].setdefault("set_annotations", False)
        for name, proc in self.pipeline:
            if not hasattr(proc, "update"):
                continue
            proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
        if sgd is not False:
        if sgd not in (None, False):
            for name, proc in self.pipeline:
                if hasattr(proc, "model"):
                    proc.model.finish_update(sgd)
@@ -1,13 +1,14 @@
from thinc.api import Model, normal_init


def PrecomputableAffine(nO, nI, nF, nP):
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
    model = Model(
        "precomputable_affine",
        forward,
        init=init,
        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
        params={"W": None, "b": None, "pad": None},
        attrs={"dropout_rate": dropout}
    )
    return model
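A quick sketch of the new keyword (dimensions invented):

model = PrecomputableAffine(nO=64, nI=96, nF=13, nP=3, dropout=0.1)
assert model.attrs["dropout_rate"] == 0.1
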
@@ -48,17 +49,14 @@ def forward(model, X, is_train):
        model.inc_grad("b", dY.sum(axis=0))
        dY = dY.reshape((dY.shape[0], nO * nP))

        Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
        Wopfi = W.transpose((1, 2, 0, 3))
        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)

        # Reuse the buffer
        dWopfi = Wopfi
        dWopfi.fill(0.0)
        model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
        # (o, p, f, i) --> (f, o, p, i)
        dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
        dWopfi = dWopfi.transpose((2, 0, 1, 3))
        model.inc_grad("W", dWopfi)
    return dXf.reshape((dXf.shape[0], nF, nI))
@@ -263,20 +263,20 @@ def build_Tok2Vec_model(
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(
            nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout,
            nO=width, nV=embed_size, column=cols.index(NORM), dropout=None,
            seed=0
        )
        if subword_features:
            prefix = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout,
                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None,
                seed=1
            )
            suffix = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout,
                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None,
                seed=2
            )
            shape = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout,
                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None,
                seed=3
            )
        else:
@@ -296,7 +296,7 @@ def build_Tok2Vec_model(
            >> Maxout(
                nO=width,
                nI=width * columns,
                nP=maxout_pieces,
                nP=3,
                dropout=0.0,
                normalize=True,
            ),

@@ -309,7 +309,7 @@ def build_Tok2Vec_model(
            >> Maxout(
                nO=width,
                nI=width * columns,
                nP=maxout_pieces,
                nP=3,
                dropout=0.0,
                normalize=True,
            ),

@@ -322,7 +322,7 @@ def build_Tok2Vec_model(
            >> Maxout(
                nO=width,
                nI=width * columns,
                nP=maxout_pieces,
                nP=3,
                dropout=0.0,
                normalize=True,
            ),

@@ -335,7 +335,7 @@ def build_Tok2Vec_model(
        reduce_dimensions = Maxout(
            nO=width,
            nI=nM * nC + width,
            nP=maxout_pieces,
            nP=3,
            dropout=0.0,
            normalize=True,
        )
@@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear
from ..syntax._parser_model import ParserStepModel


def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
    """Set up a stepwise transition-based model"""
    if upper is None:
        has_upper = False
@@ -272,7 +272,7 @@ cdef class Morphology:

    @staticmethod
    def feats_to_dict(feats):
        if not feats:
        if not feats or feats == Morphology.EMPTY_MORPH:
            return {}
        return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
                [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
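Behavior sketch (feature string invented; FEATURE_SEP, FIELD_SEP and VALUE_SEP are "|", "=" and "," in spaCy's Morphology):

Morphology.feats_to_dict("Case=Nom|Number=Sing")
# -> {"Case": "Nom", "Number": "Sing"}
Morphology.feats_to_dict(Morphology.EMPTY_MORPH)
# -> {} (newly handled by this change)
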
@@ -3,7 +3,7 @@ cimport numpy as np

import numpy
import srsly
from thinc.api import to_categorical
from thinc.api import SequenceCategoricalCrossentropy

from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
@@ -85,13 +85,10 @@ class Morphologizer(Tagger):
            doc.is_morphed = True

    def get_loss(self, examples, scores):
        scores = self.model.ops.flatten(scores)
        tag_index = {tag: i for i, tag in enumerate(self.labels)}
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = []
        for eg in examples:
            eg_truths = []
            pos_tags = eg.get_aligned("POS", as_string=True)
            morphs = eg.get_aligned("MORPH", as_string=True)
            for i in range(len(morphs)):

@@ -104,20 +101,11 @@ class Morphologizer(Tagger):
                    morph = self.vocab.strings[self.vocab.morphology.add(feats)]
                if morph == "":
                    morph = Morphology.EMPTY_MORPH
            if morph is None:
                correct[idx] = guesses[idx]
            elif morph in tag_index:
                correct[idx] = tag_index[morph]
            else:
                correct[idx] = 0
                known_labels[idx] = 0.
            idx += 1
        correct = self.model.ops.xp.array(correct, dtype="i")
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
        d_scores *= self.model.ops.asarray(known_labels)
        loss = (d_scores**2).sum()
        docs = [eg.predicted for eg in examples]
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
                eg_truths.append(morph)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError("nan value when computing loss")
        return float(loss), d_scores

    def to_bytes(self, exclude=tuple()):
@@ -334,7 +334,7 @@ class Tagger(Pipe):
        losses[self.name] += (gradient**2).sum()

    def get_loss(self, examples, scores):
        loss_func = SequenceCategoricalCrossentropy(names=self.labels)
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
@@ -521,29 +521,23 @@ class SentenceRecognizer(Tagger):
                doc.c[j].sent_start = -1

    def get_loss(self, examples, scores):
        scores = self.model.ops.flatten(scores)
        tag_index = range(len(self.labels))
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
        labels = self.labels
        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
        truths = []
        for eg in examples:
            sent_starts = eg.get_aligned("sent_start")
            for sent_start in sent_starts:
                if sent_start is None:
                    correct[idx] = guesses[idx]
                elif sent_start in tag_index:
                    correct[idx] = sent_start
            eg_truth = []
            for x in eg.get_aligned("sent_start"):
                if x is None:
                    eg_truth.append(None)
                elif x == 1:
                    eg_truth.append(labels[1])
                else:
                    correct[idx] = 0
                    known_labels[idx] = 0.
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype="i")
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
        d_scores *= self.model.ops.asarray(known_labels)
        loss = (d_scores**2).sum()
        docs = [eg.predicted for eg in examples]
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
                    # anything other than 1: 0, -1, -1 as uint64
                    eg_truth.append(labels[0])
            truths.append(eg_truth)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError("nan value when computing loss")
        return float(loss), d_scores

    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
@@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
class ProjectConfigAsset(BaseModel):
    # fmt: off
    dest: StrictStr = Field(..., title="Destination of downloaded asset")
    url: StrictStr = Field(..., title="URL of asset")
    url: Optional[StrictStr] = Field(None, title="URL of asset")
    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
    # fmt: on

@@ -246,7 +246,7 @@ class ProjectConfigSchema(BaseModel):
    # fmt: off
    variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
    assets: List[ProjectConfigAsset] = Field([], title="Data assets")
    run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
    # fmt: on
@@ -326,10 +326,11 @@ class Scorer(object):
        for token in doc:
            if token.orth_.isspace():
                continue
            gold_i = align.cand_to_gold[token.i]
            if gold_i is None:
            if align.x2y.lengths[token.i] != 1:
                self.tokens.fp += 1
                gold_i = None
            else:
                gold_i = align.x2y[token.i].dataXd[0, 0]
                self.tokens.tp += 1
            cand_tags.add((gold_i, token.tag_))
            cand_pos.add((gold_i, token.pos_))

@@ -345,7 +346,10 @@ class Scorer(object):
            if token.is_sent_start:
                cand_sent_starts.add(gold_i)
            if token.dep_.lower() not in punct_labels and token.orth_.strip():
                gold_head = align.cand_to_gold[token.head.i]
                if align.x2y.lengths[token.head.i] == 1:
                    gold_head = align.x2y[token.head.i].dataXd[0, 0]
                else:
                    gold_head = None
                # None is indistinct, so we can't just add it to the set
                # Multiple (None, None) deps are possible
                if gold_i is None or gold_head is None:
@@ -381,15 +385,9 @@ class Scorer(object):
                    gold_ents.add(gold_ent)
                    gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
            cand_per_ents = {ent_label: set() for ent_label in ent_labels}
            for ent in doc.ents:
                first = align.cand_to_gold[ent.start]
                last = align.cand_to_gold[ent.end - 1]
                if first is None or last is None:
                    self.ner.fp += 1
                    self.ner_per_ents[ent.label_].fp += 1
                else:
                    cand_ents.add((ent.label_, first, last))
                    cand_per_ents[ent.label_].add((ent.label_, first, last))
            for ent in example.get_aligned_spans_x2y(doc.ents):
                cand_ents.add((ent.label_, ent.start, ent.end - 1))
                cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
            # Scores per ent
            for k, v in self.ner_per_ents.items():
                if k in cand_per_ents:
@@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:


class ParserStepModel(Model):
    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
                 dropout=0.1):
        Model.__init__(self, name="parser_step_model", forward=step_forward)
        self.attrs["has_upper"] = has_upper
        self.attrs["dropout_rate"] = dropout
        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
        if layers[1].get_dim("nP") >= 2:
            activation = "maxout"
@@ -289,11 +291,17 @@ class ParserStepModel(Model):
        self.bp_tokvecs(d_tokvecs[:-1])
        return d_tokvecs

NUMPY_OPS = NumpyOps()

def step_forward(model: ParserStepModel, states, is_train):
    token_ids = model.get_token_ids(states)
    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
    mask = None
    if model.attrs["has_upper"]:
        dropout_rate = model.attrs["dropout_rate"]
        if is_train and dropout_rate > 0:
            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
            vector *= mask
        scores, get_d_vector = model.vec2scores(vector, is_train)
    else:
        scores = NumpyOps().asarray(vector)
@@ -305,6 +313,8 @@ def step_forward(model: ParserStepModel, states, is_train):
        # Zero vectors for unseen classes
        d_scores *= model._class_mask
        d_vector = get_d_vector(d_scores)
        if mask is not None:
            d_vector *= mask
        if isinstance(model.state2vec.ops, CupyOps) \
                and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
            # Move token_ids and d_vector to GPU, asynchronously
@@ -437,7 +447,7 @@ cdef class precompute_hiddens:
        sum_state_features(<float*>state_vector.data,
            feat_weights, &ids[0,0],
            token_ids.shape[0], self.nF, self.nO*self.nP)
        state_vector = state_vector + self.bias
        state_vector += self.bias
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector_ids):
@@ -65,7 +65,6 @@ cdef class Parser:
        self.set_output(self.moves.n_moves)
        self.cfg = dict(cfg)
        self.cfg.setdefault("update_with_oracle_cut_size", 100)
        self.cfg.setdefault("normalize_gradients_with_batch_size", True)
        self._multitasks = []
        for multitask in cfg.get("multitasks", []):
            self.add_multitask_objective(multitask)
@@ -280,11 +279,12 @@ cdef class Parser:
            [eg.predicted for eg in examples])
        if self.cfg["update_with_oracle_cut_size"] >= 1:
            # Chop sequences into lengths of this many transitions, to make the
            # batch uniform length. We randomize this to overfit less.
            # batch uniform length.
            # We used to randomize this, but it's not clear that actually helps?
            cut_size = self.cfg["update_with_oracle_cut_size"]
            states, golds, max_steps = self._init_gold_batch(
                examples,
                max_length=numpy.random.choice(range(5, cut_size))
                max_length=cut_size
            )
        else:
            states, golds, _ = self.moves.init_gold_batch(examples)
@@ -292,24 +292,15 @@ cdef class Parser:
        if not states:
            return losses
        all_states = list(states)
        states_golds = zip(states, golds)
        for _ in range(max_steps):
            if not states_golds:
                break
        states_golds = list(zip(states, golds))
        while states_golds:
            states, golds = zip(*states_golds)
            scores, backprop = model.begin_update(states)
            d_scores = self.get_batch_loss(states, golds, scores, losses)
            if self.cfg["normalize_gradients_with_batch_size"]:
                # We have to be very careful how we do this, because of the way we
                # cut up the batch. We subdivide long sequences. If we normalize
                # naively, we end up normalizing by sequence length, which
                # is bad: that would mean that states in long sequences
                # consistently get smaller gradients. Imagine if we have two
                # sequences, one length 1000, one length 20. If we cut up
                # the 1k sequence so that we have a "batch" of 50 subsequences,
                # we don't want the gradients to get 50 times smaller!
                d_scores /= n_examples

            # Note that the gradient isn't normalized by the batch size
            # here, because our "samples" are really the states...But we
            # can't normalize by the number of states either, as then we'd
            # be getting smaller gradients for states in long sequences.
            backprop(d_scores)
            # Follow the predicted action
            self.transition_states(states, scores)
@@ -407,6 +398,7 @@ cdef class Parser:
        cpu_log_loss(c_d_scores,
            costs, is_valid, &scores[i, 0], d_scores.shape[1])
        c_d_scores += d_scores.shape[1]
        # Note that we don't normalize this. See comment in update() for why.
        if losses is not None:
            losses.setdefault(self.name, 0.)
            losses[self.name] += (d_scores**2).sum()
@@ -525,21 +517,25 @@ cdef class Parser:
            StateClass state
            Transition action
        all_states = self.moves.init_batch([eg.predicted for eg in examples])
        states = []
        golds = []
        kept = []
        max_length_seen = 0
        for state, eg in zip(all_states, examples):
            if self.moves.has_gold(eg) and not state.is_final():
                gold = self.moves.init_gold(state, eg)
                oracle_actions = self.moves.get_oracle_sequence_from_state(
                    state.copy(), gold)
                kept.append((eg, state, gold, oracle_actions))
                min_length = min(min_length, len(oracle_actions))
                max_length_seen = max(max_length, len(oracle_actions))
                if len(eg.x) < max_length:
                    states.append(state)
                    golds.append(gold)
                else:
                    oracle_actions = self.moves.get_oracle_sequence_from_state(
                        state.copy(), gold)
                    kept.append((eg, state, gold, oracle_actions))
                    min_length = min(min_length, len(oracle_actions))
                    max_length_seen = max(max_length, len(oracle_actions))
        if not kept:
            return [], [], 0
            return states, golds, 0
        max_length = max(min_length, min(max_length, max_length_seen))
        states = []
        golds = []
        cdef int clas
        max_moves = 0
        for eg, state, gold, oracle_actions in kept:
@@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):

def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
    assert contains_cycle(tree) is None
    assert contains_cycle(cyclic_tree) == set([3, 4, 5])
    assert contains_cycle(cyclic_tree) == {3, 4, 5}
    assert contains_cycle(partial_tree) is None
    assert contains_cycle(multirooted_tree) is None
@@ -38,6 +38,11 @@ def test_overfitting_IO():
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # add some cases where SENT_START == -1
    train_examples[0].reference[10].is_sent_start = False
    train_examples[1].reference[1].is_sent_start = False
    train_examples[1].reference[11].is_sent_start = False

    nlp.add_pipe(senter)
    optimizer = nlp.begin_training()
@@ -23,6 +23,7 @@ def test_issue2070():
    assert len(doc) == 11


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2179():
    """Test that spurious 'extra_labels' aren't created when initializing NER."""
    nlp = Italian()

@@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
    assert len(matches) == 3


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2482():
    """Test we can serialize and deserialize a blank NER or parser model."""
    nlp = Italian()
@@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
    assert doc[0].like_num


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2800():
    """Test issue that arises when too many labels are added to NER model.
    Used to cause segfault.
    """
    nlp = English()
    train_data = []
    train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
    train_data.extend(
        [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
    )
    entity_types = [str(i) for i in range(1000)]
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
@@ -88,6 +88,7 @@ def test_issue3199():
    assert list(doc[0:3].noun_chunks) == []


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
spacy/tests/regression/test_issue3501-4000.py (new file, 472 lines)
@@ -0,0 +1,472 @@
import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.defaults import default_parser
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from thinc.api import compounding
import spacy
import srsly
import numpy

from ..util import make_tempdir, get_doc


@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop


def test_issue_3526_1(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_issue_3526_2(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_issue_3526_3(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True


def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
|
||||
}
|
||||
example_ent = {
|
||||
"text": "But Google is starting from behind.",
|
||||
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
||||
}
|
||||
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
||||
assert dep_html
|
||||
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
||||
assert ent_html
|
||||
|
||||
|
||||
def test_issue3540(en_vocab):
|
||||
words = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
tensor = numpy.asarray(
|
||||
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
||||
dtype="f",
|
||||
)
|
||||
doc = Doc(en_vocab, words=words)
|
||||
doc.tensor = tensor
|
||||
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
assert [token.text for token in doc] == gold_text
|
||||
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
assert [token.lemma_ for token in doc] == gold_lemma
|
||||
vectors_1 = [token.vector for token in doc]
|
||||
assert len(vectors_1) == len(doc)
|
||||
|
||||
with doc.retokenize() as retokenizer:
|
||||
heads = [(doc[3], 1), doc[2]]
|
||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||
|
||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||
assert [token.text for token in doc] == gold_text
|
||||
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
||||
assert [token.lemma_ for token in doc] == gold_lemma
|
||||
vectors_2 = [token.vector for token in doc]
|
||||
assert len(vectors_2) == len(doc)
|
||||
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
||||
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
||||
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
||||
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
||||
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
||||
|
||||
|
||||
def test_issue3549(en_vocab):
|
||||
"""Test that match pattern validation doesn't raise on empty errors."""
|
||||
matcher = Matcher(en_vocab, validate=True)
|
||||
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||
matcher.add("GOOD", [pattern])
|
||||
with pytest.raises(MatchPatternError):
|
||||
matcher.add("BAD", [[{"X": "Y"}]])
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue3555(en_vocab):
|
||||
"""Test that custom extensions with default None don't break matcher."""
|
||||
Token.set_extension("issue3555", default=None)
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
||||
matcher.add("TEST", [pattern])
|
||||
doc = Doc(en_vocab, words=["have", "apple"])
|
||||
matcher(doc)
|
||||
|
||||
|
||||
def test_issue3611():
|
||||
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
||||
unique_classes = ["offensive", "inoffensive"]
|
||||
x_train = [
|
||||
"This is an offensive text",
|
||||
"This is the second offensive text",
|
||||
"inoff",
|
||||
]
|
||||
y_train = ["offensive", "offensive", "inoffensive"]
|
||||
nlp = spacy.blank("en")
|
||||
# preparing the data
|
||||
train_data = []
|
||||
for text, train_instance in zip(x_train, y_train):
|
||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||
# add a text categorizer component
|
||||
textcat = nlp.create_pipe(
|
||||
"textcat",
|
||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||
)
|
||||
for label in unique_classes:
|
||||
textcat.add_label(label)
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
# training the network
|
||||
with nlp.select_pipes(enable="textcat"):
|
||||
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
||||
for i in range(3):
|
||||
losses = {}
|
||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||
|
||||
for batch in batches:
|
||||
nlp.update(
|
||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||
)
|
||||
|
||||
|
||||
def test_issue3625():
|
||||
"""Test that default punctuation rules applies to hindi unicode characters"""
|
||||
nlp = Hindi()
|
||||
doc = nlp("hi. how हुए. होटल, होटल")
|
||||
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
||||
assert [token.text for token in doc] == expected
|
||||
|
||||
|
||||
def test_issue3803():
|
||||
"""Test that spanish num-like tokens have True for like_num attribute."""
|
||||
nlp = Spanish()
|
||||
text = "2 dos 1000 mil 12 doce"
|
||||
doc = nlp(text)
|
||||
|
||||
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3830_no_subtok():
|
||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
parser.begin_training(lambda: [])
|
||||
assert "subtok" not in parser.labels
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3830_with_subtok():
|
||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||
config = {
|
||||
"learn_tokens": True,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
parser.begin_training(lambda: [])
|
||||
assert "subtok" in parser.labels
|
||||
|
||||
|
||||
def test_issue3839(en_vocab):
|
||||
"""Test that match IDs returned by the matcher are correct, are in the string """
|
||||
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
||||
matcher = Matcher(en_vocab)
|
||||
match_id = "PATTERN"
|
||||
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||
matcher.add(match_id, [pattern1])
|
||||
matches = matcher(doc)
|
||||
assert matches[0][0] == en_vocab.strings[match_id]
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add(match_id, [pattern2])
|
||||
matches = matcher(doc)
|
||||
assert matches[0][0] == en_vocab.strings[match_id]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sentence",
|
||||
[
|
||||
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
||||
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
||||
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
||||
],
|
||||
)
|
||||
def test_issue3869(sentence):
|
||||
"""Test that the Doc's count_by function works consistently"""
|
||||
nlp = English()
|
||||
doc = nlp(sentence)
|
||||
count = 0
|
||||
for token in doc:
|
||||
count += token.is_alpha
|
||||
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
||||
|
||||
|
||||
def test_issue3879(en_vocab):
|
||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||
assert len(doc) == 5
|
||||
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [pattern])
|
||||
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3880():
|
||||
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
||||
|
||||
Fixed in v7.0.5 of Thinc.
|
||||
"""
|
||||
texts = ["hello", "world", "", ""]
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
nlp.add_pipe(nlp.create_pipe("ner"))
|
||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||
nlp.get_pipe("parser").add_label("dep")
|
||||
nlp.get_pipe("ner").add_label("PERSON")
|
||||
nlp.get_pipe("tagger").add_label("NN")
|
||||
nlp.begin_training()
|
||||
for doc in nlp.pipe(texts):
|
||||
pass
|
||||
|
||||
|
||||
def test_issue3882(en_vocab):
|
||||
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
||||
copy of the Doc.
|
||||
"""
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
doc.is_parsed = True
|
||||
doc.user_data["test"] = set()
|
||||
parse_deps(doc)
|
||||
|
||||
|
||||
def test_issue3951(en_vocab):
|
||||
"""Test that combinations of optional rules are matched correctly."""
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [
|
||||
{"LOWER": "hello"},
|
||||
{"LOWER": "this", "OP": "?"},
|
||||
{"OP": "?"},
|
||||
{"LOWER": "world"},
|
||||
]
|
||||
matcher.add("TEST", [pattern])
|
||||
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 0
|
||||
|
||||
|
||||
def test_issue3959():
|
||||
""" Ensure that a modified pos attribute is serialized correctly."""
|
||||
nlp = English()
|
||||
doc = nlp(
|
||||
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
||||
)
|
||||
assert doc[0].pos_ == ""
|
||||
doc[0].pos_ = "NOUN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
# usually this is already True when starting from proper models instead of blank English
|
||||
doc.is_tagged = True
|
||||
with make_tempdir() as tmp_dir:
|
||||
file_path = tmp_dir / "my_doc"
|
||||
doc.to_disk(file_path)
|
||||
doc2 = nlp("")
|
||||
doc2.from_disk(file_path)
|
||||
assert doc2[0].pos_ == "NOUN"
|
||||
|
||||
|
||||
def test_issue3962(en_vocab):
|
||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||
# fmt: off
|
||||
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
||||
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
||||
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||
# fmt: on
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
span2 = doc[1:5] # "jests at scars ,"
|
||||
doc2 = span2.as_doc()
|
||||
doc2_json = doc2.to_json()
|
||||
assert doc2_json
|
||||
# head set to itself, being the new artificial root
|
||||
assert doc2[0].head.text == "jests"
|
||||
assert doc2[0].dep_ == "dep"
|
||||
assert doc2[1].head.text == "jests"
|
||||
assert doc2[1].dep_ == "prep"
|
||||
assert doc2[2].head.text == "at"
|
||||
assert doc2[2].dep_ == "pobj"
|
||||
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
||||
assert doc2[3].dep_ == "dep"
|
||||
# We should still have 1 sentence
|
||||
assert len(list(doc2.sents)) == 1
|
||||
span3 = doc[6:9] # "never felt a"
|
||||
doc3 = span3.as_doc()
|
||||
doc3_json = doc3.to_json()
|
||||
assert doc3_json
|
||||
assert doc3[0].head.text == "felt"
|
||||
assert doc3[0].dep_ == "neg"
|
||||
assert doc3[1].head.text == "felt"
|
||||
assert doc3[1].dep_ == "ROOT"
|
||||
assert doc3[2].head.text == "felt" # head set to ancestor
|
||||
assert doc3[2].dep_ == "dep"
|
||||
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
||||
assert len(list(doc3.sents)) == 1
|
||||
|
||||
|
||||
def test_issue3962_long(en_vocab):
|
||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||
# fmt: off
|
||||
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
||||
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
||||
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||
# fmt: on
|
||||
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
||||
doc2 = span2.as_doc()
|
||||
doc2_json = doc2.to_json()
|
||||
assert doc2_json
|
||||
# head set to itself, being the new artificial root (in sentence 1)
|
||||
assert doc2[0].head.text == "jests"
|
||||
assert doc2[0].dep_ == "ROOT"
|
||||
assert doc2[1].head.text == "jests"
|
||||
assert doc2[1].dep_ == "prep"
|
||||
assert doc2[2].head.text == "at"
|
||||
assert doc2[2].dep_ == "pobj"
|
||||
assert doc2[3].head.text == "jests"
|
||||
assert doc2[3].dep_ == "punct"
|
||||
# head set to itself, being the new artificial root (in sentence 2)
|
||||
assert doc2[4].head.text == "They"
|
||||
assert doc2[4].dep_ == "dep"
|
||||
# head set to the new artificial head (in sentence 2)
|
||||
assert doc2[4].head.text == "They"
|
||||
assert doc2[4].dep_ == "dep"
|
||||
# We should still have 2 sentences
|
||||
sents = list(doc2.sents)
|
||||
assert len(sents) == 2
|
||||
assert sents[0].text == "jests at scars ."
|
||||
assert sents[1].text == "They never"
|
||||
|
||||
|
||||
def test_issue3972(en_vocab):
|
||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||
"""
|
||||
matcher = PhraseMatcher(en_vocab)
|
||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
||||
matches = matcher(doc)
|
||||
|
||||
assert len(matches) == 2
|
||||
|
||||
# We should have a match for each of the two rules
|
||||
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
||||
assert "A" in found_ids
|
||||
assert "B" in found_ids
|
|
@ -1,8 +0,0 @@
import pytest


@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop

@ -1,85 +0,0 @@
import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy import load
import srsly

from ..util import make_tempdir


@pytest.fixture
def patterns():
    return [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]


@pytest.fixture
def add_ent():
    def add_ent_component(doc):
        doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
        return doc

    return add_ent_component


def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)

    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True

@ -1,30 +0,0 @@
from spacy import displacy


def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html

@ -1,44 +0,0 @@
from spacy.tokens import Doc

import numpy as np


def test_issue3540(en_vocab):

    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = np.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor

    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text

    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma

    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text

    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma

    vectors_2 = [token.vector for token in doc]
    assert len(vectors_2) == len(doc)

    assert vectors_1[0].tolist() == vectors_2[0].tolist()
    assert vectors_1[1].tolist() == vectors_2[1].tolist()
    assert vectors_1[2].tolist() == vectors_2[2].tolist()

    assert vectors_1[4].tolist() == vectors_2[5].tolist()
    assert vectors_1[5].tolist() == vectors_2[6].tolist()

@ -1,12 +0,0 @@
import pytest
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError


def test_issue3549(en_vocab):
    """Test that match pattern validation doesn't raise on empty errors."""
    matcher = Matcher(en_vocab, validate=True)
    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
    matcher.add("GOOD", [pattern])
    with pytest.raises(MatchPatternError):
        matcher.add("BAD", [[{"X": "Y"}]])

@ -1,14 +0,0 @@
import pytest
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher


@pytest.mark.xfail
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)

@ -1,45 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example


def test_issue3611():
    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )

    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training(X=x_train, Y=y_train)
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )

@ -1,9 +0,0 @@
from spacy.lang.hi import Hindi


def test_issue3625():
    """Test that default punctuation rules applies to hindi unicode characters"""
    nlp = Hindi()
    doc = nlp("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    assert [token.text for token in doc] == expected

@ -1,10 +0,0 @@
from spacy.lang.es import Spanish


def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]

@ -1,34 +0,0 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab

from spacy.pipeline.defaults import default_parser


def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens"""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" not in parser.labels


def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    config = {
        "learn_tokens": True,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" in parser.labels

@ -1,18 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3839(en_vocab):
    """Test that match IDs returned by the matcher are correct, are in the string """
    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
    matcher = Matcher(en_vocab)
    match_id = "PATTERN"
    pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
    pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
    matcher.add(match_id, [pattern1])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]
    matcher = Matcher(en_vocab)
    matcher.add(match_id, [pattern2])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]

@ -1,25 +0,0 @@
import pytest
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English


@pytest.mark.parametrize(
    "sentence",
    [
        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
    ],
)
def test_issue3869(sentence):
    """Test that the Doc's count_by function works consistently"""
    nlp = English()
    doc = nlp(sentence)

    count = 0
    for token in doc:
        count += token.is_alpha

    assert count == doc.count_by(IS_ALPHA).get(1, 0)

@ -1,11 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3879(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    assert len(doc) == 5
    pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [pattern])
    assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'

@ -1,21 +0,0 @@
from spacy.lang.en import English
import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("parser"))
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.get_pipe("parser").add_label("dep")
    nlp.get_pipe("ner").add_label("PERSON")
    nlp.get_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass

@ -1,12 +0,0 @@
from spacy.displacy import parse_deps
from spacy.tokens import Doc


def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)

@ -1,17 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly."""
    matcher = Matcher(en_vocab)
    pattern = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
    matches = matcher(doc)
    assert len(matches) == 0

@ -1,26 +0,0 @@
from spacy.lang.en import English
from ..util import make_tempdir


def test_issue3959():
    """ Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""

    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)

        doc2 = nlp("")
        doc2.from_disk(file_path)

        assert doc2[0].pos_ == "NOUN"

@ -1,117 +0,0 @@
import pytest

from ..util import get_doc


@pytest.fixture
def doc(en_tokenizer):
    text = "He jests at scars, that never felt a wound."
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ccomp",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_issue3962(doc):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"

    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1

    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json

    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"

    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1


@pytest.fixture
def two_sent_doc(en_tokenizer):
    text = "He jests at scars. They never felt a wound."
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ROOT",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_issue3962_long(two_sent_doc):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    assert (
        doc2[4].head.text == "They"
    )  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    assert (
        doc2[4].head.text == "They"
    )  # head set to the new artificial head (in sentence 2)
    assert doc2[4].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"

@ -1,19 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids
469  spacy/tests/regression/test_issue4001-4500.py  Normal file
@ -0,0 +1,469 @@
import pytest
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
from spacy.pipeline.defaults import default_ner
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus
from spacy.gold.converters import json2docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict

from ..util import make_tempdir


def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes.
    """
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1


def test_issue4030():
    """ Test whether textcat works fine with empty doc """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )
    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = load_model(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = load_model(nlp_dir)
        assert nlp3.lang == "en"


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works
    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)
    actual = []
    for token in doc:
        actual.append(token.pos_)
    assert actual == pos


def test_issue4190():
    def customize_tokenizer(nlp):
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
        infix_re = compile_infix_regex(nlp.Defaults.infixes)
        # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
        exceptions = {
            k: v
            for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
            if not (len(k) == 2 and k[1] == ".")
        }
        new_tokenizer = Tokenizer(
            nlp.vocab,
            exceptions,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=nlp.tokenizer.token_match,
        )
        nlp.tokenizer = new_tokenizer

    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = load_model(model_dir)
    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
    """ Test that running an entity_ruler after ner gives consistent results"""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()
    assert "ner" in nlp.pipe_names
    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2
    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names
    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_


def test_multiple_predictions():
    class DummyPipe(Pipe):
        def __init__(self):
            self.model = "dummy_model"

        def predict(self, docs):
            return ([1, 2, 3], [4, 5, 6])

        def set_annotations(self, docs, scores, tensors=None):
            return docs

    nlp = Language()
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)


@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(
        docs, beam_width=beam_width, beam_density=beam_density
    )

    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data, doesn't throw errors"""
    nlp = English()
    example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    TRAIN_DATA = [example, example]
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)


def test_issue4367():
    """Test that docbin init goes well"""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])


def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)


def test_issue4402():
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json2docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4
@ -1,23 +0,0 @@
|
|||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue4002(en_vocab):
|
||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
||||
"""
|
||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||
matcher.add("TEST", [pattern1])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
||||
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||
pattern2 = Doc(en_vocab, words=["1", "2"])
|
||||
pattern2[0].norm_ = "c"
|
||||
pattern2[1].norm_ = "d"
|
||||
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
||||
matcher.add("TEST", [pattern2])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
|
@@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example


def test_issue4030():
    """Test that textcat works fine with an empty doc."""
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )

    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )

    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0
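
The `cat_dict` comprehension in the test above builds a one-hot label mapping per training example; spelled out on its own:

unique_classes = ["offensive", "inoffensive"]
cat_dict = {label: label == "offensive" for label in unique_classes}
assert cat_dict == {"offensive": True, "inoffensive": False}
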
@@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner

from ..util import make_tempdir


def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = spacy.load(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


def test_issue4042_bug2():
    """
    Test that serialization of the NER works fine when new labels have been added.
    This is the second of the two bugs underlying issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
@@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path

from ..util import make_tempdir


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab

    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)

        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)

        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"
@@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works

    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed
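
Why `doc2` above yields two matches: with a trailing optional wildcard token, both "a" on its own and "a" plus one following token are valid spans. The first case, re-run as a self-contained snippet:

from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
matcher = Matcher(vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc = Doc(vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 2  # spans "a" and "a b"
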
@@ -1,28 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    doc_bytes = doc.to_bytes()

    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)

    actual = []
    for token in doc:
        actual.append(token.pos_)

    assert actual == pos
@@ -1,46 +0,0 @@
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util

from ..util import make_tempdir


def test_issue4190():
    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = util.load_model(model_dir)
    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


def customize_tokenizer(nlp):
    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
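
The exception filter in `customize_tokenizer` drops every tokenizer special case that is a single letter followed by a period and keeps everything else. The same filter on a toy dict (values follow spaCy's exception format, the keys here are made up for illustration):

exceptions = {"h.": [{"ORTH": "h."}], "e.g.": [{"ORTH": "e.g."}]}
filtered = {k: v for k, v in exceptions.items() if not (len(k) == 2 and k[1] == ".")}
assert "e.g." in filtered and "h." not in filtered
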
@@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


def test_issue4267():
    """Test that running an entity_ruler after ner gives consistent results."""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()

    assert "ner" in nlp.pipe_names

    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2

    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]

    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names

    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2
@@ -1,9 +0,0 @@
from spacy.lang.el import Greek


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_
@@ -1,25 +0,0 @@
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe


class DummyPipe(Pipe):
    def __init__(self):
        self.model = "dummy_model"

    def predict(self, docs):
        return ([1, 2, 3], [4, 5, 6])

    def set_annotations(self, docs, scores, tensors=None):
        return docs


@pytest.fixture
def nlp():
    return Language()


def test_multiple_predictions(nlp):
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)
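
DummyPipe above exercises the base pipe call path with a `predict` that returns more than one output: the tuple of scores is handed to `set_annotations`, which must tolerate the extra tensors. Roughly that call path, spelled out as a sketch (not the actual Pipe internals):

from spacy.language import Language

nlp = Language()
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()            # class defined in the deleted test above
scores = dummy_pipe.predict([doc])  # ([1, 2, 3], [4, 5, 6])
dummy_pipe.set_annotations([doc], scores)
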
@@ -1,47 +0,0 @@
from collections import defaultdict

import pytest

from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer

from spacy.lang.en import English
from spacy.tokens import Span


# skipped after removing Beam stuff during the Example/GoldParse refactor
@pytest.mark.skip
def test_issue4313():
    """This should not crash or exit with some strange error code"""
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(
        docs, beam_width=beam_width, beam_density=beam_density
    )

    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score
@@ -1,24 +0,0 @@
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch
from thinc.api import compounding
import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data doesn't throw errors"""

    nlp = English()
    example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    TRAIN_DATA = [example, example]

    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
@@ -1,8 +0,0 @@
from spacy.tokens import DocBin


def test_issue4367():
    """Test that docbin init goes well"""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@@ -1,10 +0,0 @@
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab


def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
@@ -1,98 +0,0 @@
from spacy.gold import Corpus
from spacy.lang.en import English

from ..util import make_tempdir
from ...gold.converters import json2docs
from ...tokens import DocBin


def test_issue4402():
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json2docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4


json_data = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "How", "ner": "O"},
                        {"id": 1, "orth": "should", "ner": "O"},
                        {"id": 2, "orth": "I", "ner": "O"},
                        {"id": 3, "orth": "cook", "ner": "O"},
                        {"id": 4, "orth": "bacon", "ner": "O"},
                        {"id": 5, "orth": "in", "ner": "O"},
                        {"id": 6, "orth": "an", "ner": "O"},
                        {"id": 7, "orth": "oven", "ner": "O"},
                        {"id": 8, "orth": "?", "ner": "O"},
                    ],
                    "brackets": [],
                },
                {
                    "tokens": [
                        {"id": 9, "orth": "\n", "ner": "O"},
                        {"id": 10, "orth": "I", "ner": "O"},
                        {"id": 11, "orth": "'ve", "ner": "O"},
                        {"id": 12, "orth": "heard", "ner": "O"},
                        {"id": 13, "orth": "of", "ner": "O"},
                        {"id": 14, "orth": "people", "ner": "O"},
                        {"id": 15, "orth": "cooking", "ner": "O"},
                        {"id": 16, "orth": "bacon", "ner": "O"},
                        {"id": 17, "orth": "in", "ner": "O"},
                        {"id": 18, "orth": "an", "ner": "O"},
                        {"id": 19, "orth": "oven", "ner": "O"},
                        {"id": 20, "orth": ".", "ner": "O"},
                    ],
                    "brackets": [],
                },
            ],
            "cats": [
                {"label": "baking", "value": 1.0},
                {"label": "not_baking", "value": 0.0},
            ],
        },
        {
            "raw": "What is the difference between white and brown eggs?\n",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "What", "ner": "O"},
                        {"id": 1, "orth": "is", "ner": "O"},
                        {"id": 2, "orth": "the", "ner": "O"},
                        {"id": 3, "orth": "difference", "ner": "O"},
                        {"id": 4, "orth": "between", "ner": "O"},
                        {"id": 5, "orth": "white", "ner": "O"},
                        {"id": 6, "orth": "and", "ner": "O"},
                        {"id": 7, "orth": "brown", "ner": "O"},
                        {"id": 8, "orth": "eggs", "ner": "O"},
                        {"id": 9, "orth": "?", "ner": "O"},
                    ],
                    "brackets": [],
                },
                {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
            ],
            "cats": [
                {"label": "baking", "value": 0.0},
                {"label": "not_baking", "value": 1.0},
            ],
        },
    ],
}
spacy/tests/regression/test_issue4501-5000.py (new file, 288 lines)

@@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle

from ..util import get_doc, make_tempdir


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})


def test_issue4590(en_vocab):
    """Test that the matches passed to an on_match callback are the same as
    the matches returned when no callback is set."""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()
    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)
    text = "The quick brown fox jumped over the lazy fox"
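    # heads below are offsets relative to each token, as expected by the
    # get_doc test helper: 0 means the token is its own head (the root)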
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    matches = matcher(doc)
    on_match_args = on_match.call_args
    assert on_match_args[0][3] == matches


def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
        nlp_reloaded.add_pipe(ruler_reloaded)
        doc_reloaded = nlp_reloaded(text)
        res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
        assert res == res_reloaded


def test_issue4651_without_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    not specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
        nlp_reloaded.add_pipe(ruler_reloaded)
        doc_reloaded = nlp_reloaded(text)
        res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
        assert res == res_reloaded


def test_issue4665():
    """
    conllu2docs should not raise an exception if the HEAD column contains an
    underscore
    """
    input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
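    # CoNLL-U columns: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC;
    # every HEAD field above is "_", the condition this test guards against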
    conllu2docs(input_data)


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
        assert kb2.get_size_entities() == 1


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
        assert "sentencizer" not in new_nlp.pipe_names
        assert "entity_ruler" in new_nlp.pipe_names


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
    """Ensure the pickling of the NER goes well"""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass


def test_issue4849():
    nlp = English()
    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )
    nlp.add_pipe(ruler)
    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """
    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    """Ensure that this runs correctly and doesn't hang or crash on Windows /
    macOS."""
    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."


def test_issue4924():
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])
@@ -1,16 +0,0 @@
from spacy.tokens import Doc, DocBin


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
@@ -1,11 +0,0 @@
import pytest

from spacy.gold import Example


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})
@@ -1,35 +0,0 @@
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc


def test_issue4590(en_vocab):
    """Test that the matches passed to an on_match callback are the same as
    the matches returned when no callback is set."""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)

    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]

    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)

    matches = matcher(doc)

    on_match_args = on_match.call_args

    assert on_match_args[0][3] == matches
@@ -1,62 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

from ..util import make_tempdir


def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"

    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)

    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]

    assert res == res_reloaded


def test_issue4651_without_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    not specified.
    """
    text = "Spacy is a python library for nlp"

    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)

    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]

    assert res == res_reloaded
@@ -1,35 +0,0 @@
import pytest

# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs

input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""


@pytest.mark.xfail
def test_issue4665():
    """
    conllu2json should not raise an exception if the HEAD column contains an
    underscore
    """
    pass
    # conllu2json(input_data)
@@ -1,36 +0,0 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English

from ..util import make_tempdir


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )

    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

        assert kb2.get_size_entities() == 1
@@ -1,20 +0,0 @@
from spacy.util import load_model_from_path
from spacy.lang.en import English

from ..util import make_tempdir


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
        assert "sentencizer" not in new_nlp.pipe_names
        assert "entity_ruler" in new_nlp.pipe_names
@@ -1,41 +0,0 @@
import pickle
import numpy

from spacy.lang.en import English
from spacy.vocab import Vocab

from spacy.tests.util import make_tempdir


def test_pickle_ner():
    """Ensure the pickling of the NER goes well"""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


def test_issue4725():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])

    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass
@@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


def test_issue4849():
    nlp = English()

    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )

    nlp.add_pipe(ruler)

    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """

    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2

    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
@@ -1,40 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Span, Doc


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    # ensures that this runs correctly and doesn't hang or crash on Windows / macOS

    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."
@@ -1,8 +0,0 @@
from spacy.gold import Example
from spacy.language import Language


def test_issue4924():
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])
@@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
    # Test that the comparison between a Span and a Token goes well
    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)

@@ -8,7 +10,6 @@ def test_issue5152():
    text = nlp("Talk about being boring!")
    text_var = nlp("Talk of being boring!")
    y = nlp("Let")

    span = text[0:3]  # Talk about being
    span_2 = text[0:3]  # Talk about being
    span_3 = text_var[0:3]  # Talk of being
@@ -63,7 +63,8 @@ def tagger():
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    tagger.begin_training(pipeline=nlp.pipeline)
    with pytest.warns(UserWarning):
        tagger.begin_training(pipeline=nlp.pipeline)
    return tagger

@@ -1,10 +1,11 @@
from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example
from spacy.gold.converters import json2docs
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding

@@ -271,75 +272,76 @@ def test_split_sentences(en_vocab):
    assert split_examples[1].text == "had loads of fun "


@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
    words = ["I", "flew to", "San Francisco Valley", "."]
    spaces = [True, True, False, False]
    words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    prefix = "Mr and Mrs Smith flew to "
    entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
    gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "O", "U-LOC", "O"]
    assert ner_tags == ["O", "O", "O", "U-LOC", "O"]

    entities = [
        (len("I "), len("I flew to"), "ORG"),
        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "U-ORG", "U-LOC", "O"]
    assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]

    entities = [
        (len("I "), len("I flew"), "ORG"),
        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
        (len("Mr and "), len("Mr and Mrs"), "PERSON"),  # "Mrs" is a Person
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", None, "U-LOC", "O"]
    assert ner_tags == ["O", None, "O", "U-LOC", "O"]


def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "Mr and Mrs Smith flew to "
    entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
    gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]

    entities = [
        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]


def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
    words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    gold_words = ["I", "flew to", "San Francisco Valley", "."]
    prefix = "Mr and Mrs Smith flew to "
    entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
    gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
    assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]

    entities = [
        (len("I "), len("I flew to"), "ORG"),
        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    gold_words = ["I", "flew to", "San Francisco Valley", "."]
    gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"]


@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
    words = ["I flew", "to", "San Francisco", "Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"]

    entities = [
        (len("I "), len("I flew to"), "ORG"),
        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
    ]
    gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"]
    assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]


def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):

@@ -349,7 +351,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
        "I flew to San Francisco Valley.",
    )
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    prefix = "I flew to "
    entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
    gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
    gold_spaces = [True, True, False, True, False, False]
    example = Example.from_dict(

@@ -405,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
    assert spans[1].label_ == "GPE"


def test_aligned_spans_y2x(en_vocab, en_tokenizer):
    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len("Mr and Mrs Smith"), "PERSON"),
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    ents_ref = example.reference.ents
    assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
    assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]


def test_aligned_spans_x2y(en_vocab, en_tokenizer):
    text = "Mr and Mrs Smith flew to San Francisco Valley"
    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
                {"label": "LOC", "pattern": "San Francisco Valley"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len("Mr and Mrs Smith"), "PERSON"),
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]

    # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
    ents_pred = example.predicted.ents
    assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
    assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]


def test_gold_ner_missing_tags(en_tokenizer):
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]

@@ -412,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
    assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]


def test_projectivize(en_tokenizer):
    doc = en_tokenizer("He pretty quickly walks away")
    heads = [3, 2, 3, 0, 2]
    example = Example.from_dict(doc, {"heads": heads})
    proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
    nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
    assert proj_heads == [3, 2, 3, 0, 3]
    assert nonproj_heads == [3, 2, 3, 0, 2]


def test_iob_to_biluo():
    good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
    good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]

@@ -514,6 +570,7 @@ def test_make_orth_variants(doc):
    make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)


@pytest.mark.skip("Outdated")
@pytest.mark.parametrize(
    "tokens_a,tokens_b,expected",
    [

@@ -537,12 +594,12 @@ def test_make_orth_variants(doc):
        ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
    ],
)
def test_align(tokens_a, tokens_b, expected):
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
    assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected
def test_align(tokens_a, tokens_b, expected):  # noqa
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)  # noqa
    assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected  # noqa
    # check symmetry
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
    assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)  # noqa
    assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected  # noqa


def test_goldparse_startswith_space(en_tokenizer):

@@ -556,7 +613,7 @@ def test_goldparse_startswith_space(en_tokenizer):
        doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
    )
    ner_tags = example.get_aligned_ner()
    assert ner_tags == [None, "U-DATE"]
    assert ner_tags == ["O", "U-DATE"]
    assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]

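
The BILUO assertions throughout this file follow the standard scheme: B/I/L mark the beginning, inside and last tokens of a multi-token entity, U a single-token entity, O a token outside any entity, and None an unalignable token. A minimal standalone check with the offsets helper imported at the top of the file:

from spacy.gold import biluo_tags_from_offsets
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("I flew to San Francisco Valley.")
entities = [(10, 30, "LOC")]  # character offsets of "San Francisco Valley"
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
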
@@ -55,7 +55,7 @@ def test_aligned_tags():
    predicted = Doc(vocab, words=pred_words)
    example = Example.from_dict(predicted, annots)
    aligned_tags = example.get_aligned("tag", as_string=True)
    assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"]
    assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]


def test_aligned_tags_multi():

spacy/tests/test_projects.py (new file, 31 lines)

@@ -0,0 +1,31 @@
import pytest
from spacy.cli.project.util import validate_project_commands
from spacy.schemas import ProjectConfigSchema, validate


@pytest.mark.parametrize(
    "config",
    [
        {"commands": [{"name": "a"}, {"name": "a"}]},
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    with pytest.raises(SystemExit):
        validate_project_commands(config)


@pytest.mark.parametrize(
    "config,n_errors",
    [
        ({"commands": {"a": []}}, 1),
        ({"commands": [{"help": "..."}]}, 1),
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        ({"commands": [{"extra": "b"}]}, 2),
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    errors = validate(ProjectConfigSchema, config)
    assert len(errors) == n_errors
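
Reading the parametrized failure cases above: duplicate command names, a workflow whose name collides with a command, and a workflow step that references an undefined command all abort validation with SystemExit. For contrast, a hypothetical config that should pass under those same rules (an illustration, not part of the original tests):

config = {"commands": [{"name": "a"}], "workflows": {"all": ["a"]}}
validate_project_commands(config)  # unique names, every step defined -> no exit
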
@@ -449,6 +449,16 @@ def split_command(command: str) -> List[str]:
    return shlex.split(command, posix=not is_windows)


def join_command(command: List[str]) -> str:
    """Join a command using shlex. shlex.join is only available for Python 3.8+,
    so we're using a workaround here.

    command (List[str]): The command to join.
    RETURNS (str): The joined command
    """
    return " ".join(shlex.quote(cmd) for cmd in command)


def run_command(command: Union[str, List[str]]) -> None:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.

@@ -520,6 +530,15 @@ def get_checksum(path: Union[Path, str]) -> str:
    return hashlib.md5(Path(path).read_bytes()).hexdigest()


def is_cwd(path: Union[Path, str]) -> bool:
    """Check whether a path is the current working directory.

    path (Union[Path, str]): The directory path.
    RETURNS (bool): Whether the path is the current working directory.
    """
    return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()


def is_in_jupyter():
    """Check if user is running spaCy from a Jupyter notebook by detecting the
    IPython kernel. Mainly used for the displaCy visualizer.

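
The new `join_command` is the inverse of the `split_command` helper defined just above it; a quick round-trip, assuming both are importable from spacy.util on this branch:

from spacy.util import join_command, split_command

cmd = ["python", "-m", "spacy", "info"]
joined = join_command(cmd)  # "python -m spacy info"
assert split_command(joined) == cmd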