mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
commit
028f8210e8
|
@ -9,27 +9,28 @@ max_length = 5000
|
||||||
limit = 0
|
limit = 0
|
||||||
# Data augmentation
|
# Data augmentation
|
||||||
orth_variant_level = 0.0
|
orth_variant_level = 0.0
|
||||||
dropout = 0.2
|
dropout = 0.1
|
||||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||||
patience = 1600
|
patience = 100000
|
||||||
max_epochs = 0
|
max_epochs = 0
|
||||||
max_steps = 20000
|
max_steps = 0
|
||||||
eval_frequency = 500
|
eval_frequency = 1000
|
||||||
# Other settings
|
# Other settings
|
||||||
seed = 0
|
seed = 0
|
||||||
accumulate_gradient = 1
|
accumulate_gradient = 2
|
||||||
use_pytorch_for_gpu_memory = false
|
use_pytorch_for_gpu_memory = false
|
||||||
# Control how scores are printed and checkpoints are evaluated.
|
# Control how scores are printed and checkpoints are evaluated.
|
||||||
scores = ["speed", "ents_p", "ents_r", "ents_f"]
|
scores = ["speed", "ents_p", "ents_r", "ents_f"]
|
||||||
score_weights = {"ents_f": 1.0}
|
score_weights = {"ents_f": 1.0}
|
||||||
# These settings are invalid for the transformer models.
|
# These settings are invalid for the transformer models.
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
discard_oversize = false
|
discard_oversize = true
|
||||||
omit_extra_lookups = false
|
omit_extra_lookups = false
|
||||||
|
batch_by_words = true
|
||||||
|
|
||||||
[training.batch_size]
|
[training.batch_size]
|
||||||
@schedules = "compounding.v1"
|
@schedules = "compounding.v1"
|
||||||
start = 100
|
start = 1000
|
||||||
stop = 1000
|
stop = 1000
|
||||||
compound = 1.001
|
compound = 1.001
|
||||||
|
|
||||||
|
@ -37,18 +38,18 @@ compound = 1.001
|
||||||
@optimizers = "Adam.v1"
|
@optimizers = "Adam.v1"
|
||||||
beta1 = 0.9
|
beta1 = 0.9
|
||||||
beta2 = 0.999
|
beta2 = 0.999
|
||||||
L2_is_weight_decay = false
|
L2_is_weight_decay = true
|
||||||
L2 = 1e-6
|
L2 = 0.01
|
||||||
grad_clip = 1.0
|
grad_clip = 1.0
|
||||||
use_averages = true
|
use_averages = true
|
||||||
eps = 1e-8
|
eps = 1e-8
|
||||||
learn_rate = 0.001
|
learn_rate = 0.001
|
||||||
|
|
||||||
#[optimizer.learn_rate]
|
#[training.optimizer.learn_rate]
|
||||||
#@schedules = "warmup_linear.v1"
|
#@schedules = "warmup_linear.v1"
|
||||||
#warmup_steps = 250
|
#warmup_steps = 1000
|
||||||
#total_steps = 20000
|
#total_steps = 50000
|
||||||
#initial_rate = 0.001
|
#initial_rate = 0.003
|
||||||
|
|
||||||
[nlp]
|
[nlp]
|
||||||
lang = "en"
|
lang = "en"
|
||||||
|
@ -58,8 +59,6 @@ vectors = null
|
||||||
factory = "ner"
|
factory = "ner"
|
||||||
learn_tokens = false
|
learn_tokens = false
|
||||||
min_action_freq = 1
|
min_action_freq = 1
|
||||||
beam_width = 1
|
|
||||||
beam_update_prob = 1.0
|
|
||||||
|
|
||||||
[nlp.pipeline.ner.model]
|
[nlp.pipeline.ner.model]
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
@ -75,6 +74,6 @@ width = 96
|
||||||
depth = 4
|
depth = 4
|
||||||
window_size = 1
|
window_size = 1
|
||||||
embed_size = 2000
|
embed_size = 2000
|
||||||
maxout_pieces = 3
|
maxout_pieces = 1
|
||||||
subword_features = true
|
subword_features = true
|
||||||
dropout = ${training:dropout}
|
dropout = ${training:dropout}
|
||||||
|
|
|
@ -7,6 +7,7 @@ requires = [
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0a12,<8.0.0a20",
|
"thinc>=8.0.0a12,<8.0.0a20",
|
||||||
"blis>=0.4.0,<0.5.0"
|
"blis>=0.4.0,<0.5.0",
|
||||||
|
"pytokenizations"
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
|
@ -14,6 +14,7 @@ numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.3.0,<2.0.0
|
||||||
|
pytokenizations
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging
|
packaging
|
||||||
|
|
|
@ -51,6 +51,7 @@ install_requires =
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.3.0,<2.0.0
|
||||||
|
pytokenizations
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging
|
packaging
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -1,11 +1,11 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
from setuptools import Extension, setup, find_packages
|
||||||
import sys
|
import sys
|
||||||
import platform
|
import platform
|
||||||
from distutils.command.build_ext import build_ext
|
from distutils.command.build_ext import build_ext
|
||||||
from distutils.sysconfig import get_python_inc
|
from distutils.sysconfig import get_python_inc
|
||||||
import distutils.util
|
import distutils.util
|
||||||
from distutils import ccompiler, msvccompiler
|
from distutils import ccompiler, msvccompiler
|
||||||
from setuptools import Extension, setup, find_packages
|
|
||||||
import numpy
|
import numpy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -23,7 +23,6 @@ Options.docstrings = True
|
||||||
|
|
||||||
PACKAGES = find_packages()
|
PACKAGES = find_packages()
|
||||||
MOD_NAMES = [
|
MOD_NAMES = [
|
||||||
"spacy.gold.align",
|
|
||||||
"spacy.gold.example",
|
"spacy.gold.example",
|
||||||
"spacy.parts_of_speech",
|
"spacy.parts_of_speech",
|
||||||
"spacy.strings",
|
"spacy.strings",
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a1"
|
__version__ = "3.0.0a2"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
|
||||||
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
||||||
|
|
|
@ -15,8 +15,10 @@ from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_model import init_model # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project import project_clone, project_assets, project_run # noqa: F401
|
from .project.clone import project_clone # noqa: F401
|
||||||
from .project import project_run_all # noqa: F401
|
from .project.assets import project_assets # noqa: F401
|
||||||
|
from .project.run import project_run # noqa: F401
|
||||||
|
from .project.dvc import project_update_dvc # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
|
|
|
@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli
|
DOCS: https://spacy.io/api/cli
|
||||||
"""
|
"""
|
||||||
|
PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
|
||||||
|
project templates. You'd typically start by cloning a project template to a local
|
||||||
|
directory and fetching its assets like datasets etc. See the project's
|
||||||
|
project.yml for the available commands.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
app = typer.Typer(name=NAME, help=HELP)
|
app = typer.Typer(name=NAME, help=HELP)
|
||||||
|
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||||
|
app.add_typer(project_cli)
|
||||||
|
|
||||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||||
# keep the names short, but not needed at the moment.
|
# keep the names short, but not needed at the moment.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Sequence, Union
|
from typing import Optional, Sequence
|
||||||
import requests
|
import requests
|
||||||
import sys
|
import sys
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import is_package, get_base_version, run_command
|
from ..util import is_package, get_base_version, run_command
|
||||||
|
|
||||||
|
# These are the old shortcuts we previously supported in spacy download. As of
|
||||||
|
# v3, shortcuts are deprecated so we're not expecting to add anything to this
|
||||||
|
# list. It only exists to show users warnings.
|
||||||
|
OLD_SHORTCUTS = {
|
||||||
|
"en": "en_core_web_sm",
|
||||||
|
"de": "de_core_news_sm",
|
||||||
|
"es": "es_core_news_sm",
|
||||||
|
"pt": "pt_core_news_sm",
|
||||||
|
"fr": "fr_core_news_sm",
|
||||||
|
"it": "it_core_news_sm",
|
||||||
|
"nl": "nl_core_news_sm",
|
||||||
|
"el": "el_core_news_sm",
|
||||||
|
"nb": "nb_core_news_sm",
|
||||||
|
"lt": "lt_core_news_sm",
|
||||||
|
"xx": "xx_ent_wiki_sm",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
"download",
|
"download",
|
||||||
|
@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
version = components[-1]
|
version = components[-1]
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
else:
|
else:
|
||||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
model_name = model
|
||||||
model_name = shortcuts.get(model, model)
|
if model in OLD_SHORTCUTS:
|
||||||
|
msg.warn(
|
||||||
|
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
|
||||||
|
f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
|
||||||
|
)
|
||||||
|
model_name = OLD_SHORTCUTS[model]
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
|
@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_json(url: str, desc: str) -> Union[dict, list]:
|
def get_compatibility() -> dict:
|
||||||
r = requests.get(url)
|
version = get_base_version(about.__version__)
|
||||||
|
r = requests.get(about.__compatibility__)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"Server error ({r.status_code})",
|
f"Server error ({r.status_code})",
|
||||||
f"Couldn't fetch {desc}. Please find a model for your spaCy "
|
f"Couldn't fetch compatibility table. Please find a model for your spaCy "
|
||||||
f"installation (v{about.__version__}), and download it manually. "
|
f"installation (v{about.__version__}), and download it manually. "
|
||||||
f"For more details, see the documentation: "
|
f"For more details, see the documentation: "
|
||||||
f"https://spacy.io/usage/models",
|
f"https://spacy.io/usage/models",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
return r.json()
|
comp_table = r.json()
|
||||||
|
|
||||||
|
|
||||||
def get_compatibility() -> dict:
|
|
||||||
version = get_base_version(about.__version__)
|
|
||||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
|
||||||
comp = comp_table["spacy"]
|
comp = comp_table["spacy"]
|
||||||
if version not in comp:
|
if version not in comp:
|
||||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
||||||
|
|
|
@ -1,708 +0,0 @@
|
||||||
from typing import List, Dict, Any, Optional, Sequence
|
|
||||||
import typer
|
|
||||||
import srsly
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import subprocess
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
import sys
|
|
||||||
import requests
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
from ._app import app, Arg, Opt, COMMAND, NAME
|
|
||||||
from .. import about
|
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
|
||||||
from ..util import ensure_path, run_command, make_tempdir, working_dir
|
|
||||||
from ..util import get_hash, get_checksum, split_command
|
|
||||||
|
|
||||||
|
|
||||||
CONFIG_FILE = "project.yml"
|
|
||||||
DVC_CONFIG = "dvc.yaml"
|
|
||||||
DVC_DIR = ".dvc"
|
|
||||||
DIRS = [
|
|
||||||
"assets",
|
|
||||||
"metas",
|
|
||||||
"configs",
|
|
||||||
"packages",
|
|
||||||
"metrics",
|
|
||||||
"scripts",
|
|
||||||
"notebooks",
|
|
||||||
"training",
|
|
||||||
"corpus",
|
|
||||||
]
|
|
||||||
CACHES = [
|
|
||||||
Path.home() / ".torch",
|
|
||||||
Path.home() / ".caches" / "torch",
|
|
||||||
os.environ.get("TORCH_HOME"),
|
|
||||||
Path.home() / ".keras",
|
|
||||||
]
|
|
||||||
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
|
|
||||||
# it directly and edit the project.yml instead and re-run the project."""
|
|
||||||
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
|
|
||||||
templates. You'd typically start by cloning a project template to a local
|
|
||||||
directory and fetching its assets like datasets etc. See the project's
|
|
||||||
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
|
|
||||||
Version Control) to manage input and output files and to ensure steps are only
|
|
||||||
re-run if their inputs change.
|
|
||||||
"""
|
|
||||||
|
|
||||||
project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.callback(invoke_without_command=True)
|
|
||||||
def callback(ctx: typer.Context):
|
|
||||||
"""This runs before every project command and ensures DVC is installed."""
|
|
||||||
ensure_dvc()
|
|
||||||
|
|
||||||
|
|
||||||
################
|
|
||||||
# CLI COMMANDS #
|
|
||||||
################
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("clone")
|
|
||||||
def project_clone_cli(
|
|
||||||
# fmt: off
|
|
||||||
name: str = Arg(..., help="The name of the template to fetch"),
|
|
||||||
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
|
||||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
|
||||||
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Clone a project template from a repository. Calls into "git" and will
|
|
||||||
only download the files from the given subdirectory. The GitHub repo
|
|
||||||
defaults to the official spaCy template repo, but can be customized
|
|
||||||
(including using a private repo). Setting the --git flag will also
|
|
||||||
initialize the project directory as a Git repo. If the project is intended
|
|
||||||
to be a Git repo, it should be initialized with Git first, before
|
|
||||||
initializing DVC (Data Version Control). This allows DVC to integrate with
|
|
||||||
Git.
|
|
||||||
"""
|
|
||||||
if dest == Path.cwd():
|
|
||||||
dest = dest / name
|
|
||||||
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("init")
|
|
||||||
def project_init_cli(
|
|
||||||
# fmt: off
|
|
||||||
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Initialize a project directory with DVC and optionally Git. This should
|
|
||||||
typically be taken care of automatically when you run the "project clone"
|
|
||||||
command, but you can also run it separately. If the project is intended to
|
|
||||||
be a Git repo, it should be initialized with Git first, before initializing
|
|
||||||
DVC. This allows DVC to integrate with Git.
|
|
||||||
"""
|
|
||||||
project_init(path, git=git, force=force, silent=True)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("assets")
|
|
||||||
def project_assets_cli(
|
|
||||||
# fmt: off
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Use DVC (Data Version Control) to fetch project assets. Assets are
|
|
||||||
defined in the "assets" section of the project config. If possible, DVC
|
|
||||||
will try to track the files so you can pull changes from upstream. It will
|
|
||||||
also try and store the checksum so the assets are versioned. If the file
|
|
||||||
can't be tracked or checked, it will be downloaded without DVC. If a checksum
|
|
||||||
is provided in the project config, the file is only downloaded if no local
|
|
||||||
file with the same checksum exists.
|
|
||||||
"""
|
|
||||||
project_assets(project_dir)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run-all",
|
|
||||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
||||||
)
|
|
||||||
def project_run_all_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context,
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run all commands defined in the project. This command will use DVC and
|
|
||||||
the defined outputs and dependencies in the project config to determine
|
|
||||||
which steps need to be re-run and where to start. This means you're only
|
|
||||||
re-generating data if the inputs have changed.
|
|
||||||
|
|
||||||
This command calls into "dvc repro" and all additional arguments are passed
|
|
||||||
to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
|
|
||||||
"""
|
|
||||||
if show_help:
|
|
||||||
print_run_help(project_dir)
|
|
||||||
else:
|
|
||||||
project_run_all(project_dir, *ctx.args)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
||||||
)
|
|
||||||
def project_run_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context,
|
|
||||||
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run a named script defined in the project config. If the command is
|
|
||||||
part of the default pipeline defined in the "run" section, DVC is used to
|
|
||||||
determine whether the step should re-run if its inputs have changed, or
|
|
||||||
whether everything is up to date. If the script is not part of the default
|
|
||||||
pipeline, it will be called separately without DVC.
|
|
||||||
|
|
||||||
If DVC is used, the command calls into "dvc repro" and all additional
|
|
||||||
arguments are passed to the "dvc repro" command:
|
|
||||||
https://dvc.org/doc/command-reference/repro
|
|
||||||
"""
|
|
||||||
if show_help or not subcommand:
|
|
||||||
print_run_help(project_dir, subcommand)
|
|
||||||
else:
|
|
||||||
project_run(project_dir, subcommand, *ctx.args)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("exec", hidden=True)
|
|
||||||
def project_exec_cli(
|
|
||||||
# fmt: off
|
|
||||||
subcommand: str = Arg(..., help="Name of command defined in project config"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Execute a command defined in the project config. This CLI command is
|
|
||||||
only called internally in auto-generated DVC pipelines, as a shortcut for
|
|
||||||
multi-step commands in the project config. You typically shouldn't have to
|
|
||||||
call it yourself. To run a command, call "run" or "run-all".
|
|
||||||
"""
|
|
||||||
project_exec(project_dir, subcommand)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("update-dvc")
|
|
||||||
def project_update_dvc_cli(
|
|
||||||
# fmt: off
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Update the auto-generated DVC config file. Uses the steps defined in the
|
|
||||||
"run" section of the project config. This typically happens automatically
|
|
||||||
when running a command, but can also be triggered manually if needed.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
|
||||||
if updated:
|
|
||||||
msg.good(f"Updated DVC config from {CONFIG_FILE}")
|
|
||||||
else:
|
|
||||||
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
|
||||||
|
|
||||||
|
|
||||||
app.add_typer(project_cli, name="project")
|
|
||||||
|
|
||||||
|
|
||||||
#################
|
|
||||||
# CLI FUNCTIONS #
|
|
||||||
#################
|
|
||||||
|
|
||||||
|
|
||||||
def project_clone(
|
|
||||||
name: str,
|
|
||||||
dest: Path,
|
|
||||||
*,
|
|
||||||
repo: str = about.__projects__,
|
|
||||||
git: bool = False,
|
|
||||||
no_init: bool = False,
|
|
||||||
) -> None:
|
|
||||||
"""Clone a project template from a repository.
|
|
||||||
|
|
||||||
name (str): Name of subdirectory to clone.
|
|
||||||
dest (Path): Destination path of cloned project.
|
|
||||||
repo (str): URL of Git repo containing project templates.
|
|
||||||
git (bool): Initialize project as Git repo. Should be set to True if project
|
|
||||||
is intended as a repo, since it will allow DVC to integrate with Git.
|
|
||||||
no_init (bool): Don't initialize DVC and Git automatically. If True, the
|
|
||||||
"init" command or "git init" and "dvc init" need to be run manually.
|
|
||||||
"""
|
|
||||||
dest = ensure_path(dest)
|
|
||||||
check_clone(name, dest, repo)
|
|
||||||
project_dir = dest.resolve()
|
|
||||||
# We're using Git and sparse checkout to only clone the files we need
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
|
||||||
try:
|
|
||||||
run_command(cmd)
|
|
||||||
except SystemExit:
|
|
||||||
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
|
|
||||||
msg.fail(err)
|
|
||||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
|
||||||
f.write(name)
|
|
||||||
try:
|
|
||||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
|
||||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
|
||||||
except SystemExit:
|
|
||||||
err = f"Could not clone '{name}' in the repo '{repo}'."
|
|
||||||
msg.fail(err)
|
|
||||||
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
|
||||||
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
|
|
||||||
for sub_dir in DIRS:
|
|
||||||
dir_path = project_dir / sub_dir
|
|
||||||
if not dir_path.exists():
|
|
||||||
dir_path.mkdir(parents=True)
|
|
||||||
if not no_init:
|
|
||||||
project_init(project_dir, git=git, force=True, silent=True)
|
|
||||||
msg.good(f"Your project is now ready!", dest)
|
|
||||||
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_init(
|
|
||||||
project_dir: Path,
|
|
||||||
*,
|
|
||||||
git: bool = False,
|
|
||||||
force: bool = False,
|
|
||||||
silent: bool = False,
|
|
||||||
analytics: bool = False,
|
|
||||||
):
|
|
||||||
"""Initialize a project as a DVC and (optionally) as a Git repo.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
git (bool): Also call "git init" to initialize directory as a Git repo.
|
|
||||||
silent (bool): Don't print any output (via DVC).
|
|
||||||
analytics (bool): Opt-in to DVC analytics (defaults to False).
|
|
||||||
"""
|
|
||||||
with working_dir(project_dir) as cwd:
|
|
||||||
if git:
|
|
||||||
run_command(["git", "init"])
|
|
||||||
init_cmd = ["dvc", "init"]
|
|
||||||
if silent:
|
|
||||||
init_cmd.append("--quiet")
|
|
||||||
if not git:
|
|
||||||
init_cmd.append("--no-scm")
|
|
||||||
if force:
|
|
||||||
init_cmd.append("--force")
|
|
||||||
run_command(init_cmd)
|
|
||||||
# We don't want to have analytics on by default – our users should
|
|
||||||
# opt-in explicitly. If they want it, they can always enable it.
|
|
||||||
if not analytics:
|
|
||||||
run_command(["dvc", "config", "core.analytics", "false"])
|
|
||||||
# Remove unused and confusing plot templates from .dvc directory
|
|
||||||
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
|
|
||||||
# once you commit your changes via Git and it creates a bunch of files
|
|
||||||
# that have no purpose
|
|
||||||
plots_dir = cwd / DVC_DIR / "plots"
|
|
||||||
if plots_dir.exists():
|
|
||||||
shutil.rmtree(str(plots_dir))
|
|
||||||
config = load_project_config(cwd)
|
|
||||||
setup_check_dvc(cwd, config)
|
|
||||||
|
|
||||||
|
|
||||||
def project_assets(project_dir: Path) -> None:
|
|
||||||
"""Fetch assets for a project using DVC if possible.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
"""
|
|
||||||
project_path = ensure_path(project_dir)
|
|
||||||
config = load_project_config(project_path)
|
|
||||||
setup_check_dvc(project_path, config)
|
|
||||||
assets = config.get("assets", {})
|
|
||||||
if not assets:
|
|
||||||
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
|
||||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
fetched_assets = []
|
|
||||||
for asset in assets:
|
|
||||||
url = asset["url"].format(**variables)
|
|
||||||
dest = asset["dest"].format(**variables)
|
|
||||||
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
|
|
||||||
if fetched_path:
|
|
||||||
fetched_assets.append(str(fetched_path))
|
|
||||||
if fetched_assets:
|
|
||||||
with working_dir(project_path):
|
|
||||||
run_command(["dvc", "add", *fetched_assets, "--external"])
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_asset(
|
|
||||||
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
|
||||||
) -> Optional[Path]:
|
|
||||||
"""Fetch an asset from a given URL or path. Will try to import the file
|
|
||||||
using DVC's import-url if possible (fully tracked and versioned) and falls
|
|
||||||
back to get-url (versioned) and a non-DVC download if necessary. If a
|
|
||||||
checksum is provided and a local file exists, it's only re-downloaded if the
|
|
||||||
checksum doesn't match.
|
|
||||||
|
|
||||||
project_path (Path): Path to project directory.
|
|
||||||
url (str): URL or path to asset.
|
|
||||||
checksum (Optional[str]): Optional expected checksum of local file.
|
|
||||||
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
|
||||||
the asset failed.
|
|
||||||
"""
|
|
||||||
url = convert_asset_url(url)
|
|
||||||
dest_path = (project_path / dest).resolve()
|
|
||||||
if dest_path.exists() and checksum:
|
|
||||||
# If there's already a file, check for checksum
|
|
||||||
# TODO: add support for caches (dvc import-url with local path)
|
|
||||||
if checksum == get_checksum(dest_path):
|
|
||||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
|
||||||
return dest_path
|
|
||||||
with working_dir(project_path):
|
|
||||||
try:
|
|
||||||
# If these fail, we don't want to output an error or info message.
|
|
||||||
# Try with tracking the source first, then just downloading with
|
|
||||||
# DVC, then a regular non-DVC download.
|
|
||||||
try:
|
|
||||||
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
|
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
try:
|
|
||||||
download_file(url, dest_path)
|
|
||||||
except requests.exceptions.HTTPError as e:
|
|
||||||
msg.fail(f"Download failed: {dest}", e)
|
|
||||||
return None
|
|
||||||
if checksum and checksum != get_checksum(dest_path):
|
|
||||||
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
|
|
||||||
msg.good(f"Fetched asset {dest}")
|
|
||||||
return dest_path
|
|
||||||
|
|
||||||
|
|
||||||
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
|
||||||
"""Run all commands defined in the project using DVC.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
*dvc_args: Other arguments passed to "dvc repro".
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
dvc_cmd = ["dvc", "repro", *dvc_args]
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_command(dvc_cmd)
|
|
||||||
|
|
||||||
|
|
||||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
|
||||||
"""Simulate a CLI help prompt using the info available in the project config.
|
|
||||||
|
|
||||||
project_dir (Path): The project directory.
|
|
||||||
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
|
||||||
provided, the subcommand help is shown. Otherwise, the top-level help
|
|
||||||
and a list of available commands is printed.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
if subcommand:
|
|
||||||
validate_subcommand(commands.keys(), subcommand)
|
|
||||||
print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
|
|
||||||
help_text = commands[subcommand].get("help")
|
|
||||||
if help_text:
|
|
||||||
msg.text(f"\n{help_text}\n")
|
|
||||||
else:
|
|
||||||
print(f"\nAvailable commands in {CONFIG_FILE}")
|
|
||||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
|
|
||||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
|
||||||
msg.text("Run all commands defined in the 'run' block of the project config:")
|
|
||||||
print(f"{COMMAND} project run-all {project_dir}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
|
||||||
"""Run a named script defined in the project config. If the script is part
|
|
||||||
of the default pipeline (defined in the "run" section), DVC is used to
|
|
||||||
execute the command, so it can determine whether to rerun it. It then
|
|
||||||
calls into "exec" to execute it.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
subcommand (str): Name of command to run.
|
|
||||||
*dvc_args: Other arguments passed to "dvc repro".
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
validate_subcommand(commands.keys(), subcommand)
|
|
||||||
if subcommand in config.get("run", []):
|
|
||||||
# This is one of the pipeline commands tracked in DVC
|
|
||||||
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_command(dvc_cmd)
|
|
||||||
else:
|
|
||||||
cmd = commands[subcommand]
|
|
||||||
# Deps in non-DVC commands aren't tracked, but if they're defined,
|
|
||||||
# make sure they exist before running the command
|
|
||||||
for dep in cmd.get("deps", []):
|
|
||||||
if not (project_dir / dep).exists():
|
|
||||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_commands(cmd["script"], variables)
|
|
||||||
|
|
||||||
|
|
||||||
def project_exec(project_dir: Path, subcommand: str):
|
|
||||||
"""Execute a command defined in the project config.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
subcommand (str): Name of command to run.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_commands(commands[subcommand]["script"], variables)
|
|
||||||
|
|
||||||
|
|
||||||
###########
|
|
||||||
# HELPERS #
|
|
||||||
###########
|
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(path: Path) -> Dict[str, Any]:
|
|
||||||
"""Load the project config file from a directory and validate it.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
RETURNS (Dict[str, Any]): The loaded project config.
|
|
||||||
"""
|
|
||||||
config_path = path / CONFIG_FILE
|
|
||||||
if not config_path.exists():
|
|
||||||
msg.fail("Can't find project config", config_path, exits=1)
|
|
||||||
invalid_err = f"Invalid project config in {CONFIG_FILE}"
|
|
||||||
try:
|
|
||||||
config = srsly.read_yaml(config_path)
|
|
||||||
except ValueError as e:
|
|
||||||
msg.fail(invalid_err, e, exits=1)
|
|
||||||
errors = validate(ProjectConfigSchema, config)
|
|
||||||
if errors:
|
|
||||||
msg.fail(invalid_err, "\n".join(errors), exits=1)
|
|
||||||
return config
|
|
||||||
|
|
||||||
|
|
||||||
def update_dvc_config(
|
|
||||||
path: Path,
|
|
||||||
config: Dict[str, Any],
|
|
||||||
verbose: bool = False,
|
|
||||||
silent: bool = False,
|
|
||||||
force: bool = False,
|
|
||||||
) -> bool:
|
|
||||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
|
||||||
project directory. The file is auto-generated based on the config. The
|
|
||||||
first line of the auto-generated file specifies the hash of the config
|
|
||||||
dict, so if any of the config values change, the DVC config is regenerated.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
config (Dict[str, Any]): The loaded project config.
|
|
||||||
verbose (bool): Whether to print additional info (via DVC).
|
|
||||||
silent (bool): Don't output anything (via DVC).
|
|
||||||
force (bool): Force update, even if hashes match.
|
|
||||||
RETURNS (bool): Whether the DVC config file was updated.
|
|
||||||
"""
|
|
||||||
config_hash = get_hash(config)
|
|
||||||
path = path.resolve()
|
|
||||||
dvc_config_path = path / DVC_CONFIG
|
|
||||||
if dvc_config_path.exists():
|
|
||||||
# Check if the file was generated using the current config, if not, redo
|
|
||||||
with dvc_config_path.open("r", encoding="utf8") as f:
|
|
||||||
ref_hash = f.readline().strip().replace("# ", "")
|
|
||||||
if ref_hash == config_hash and not force:
|
|
||||||
return False # Nothing has changed in project config, don't need to update
|
|
||||||
dvc_config_path.unlink()
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = []
|
|
||||||
# We only want to include commands that are part of the main list of "run"
|
|
||||||
# commands in project.yml and should be run in sequence
|
|
||||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
|
||||||
for name in config.get("run", []):
|
|
||||||
validate_subcommand(config_commands.keys(), name)
|
|
||||||
command = config_commands[name]
|
|
||||||
deps = command.get("deps", [])
|
|
||||||
outputs = command.get("outputs", [])
|
|
||||||
outputs_no_cache = command.get("outputs_no_cache", [])
|
|
||||||
if not deps and not outputs and not outputs_no_cache:
|
|
||||||
continue
|
|
||||||
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
|
||||||
# and we don't want arbitrary paths in there
|
|
||||||
project_cmd = ["python", "-m", NAME, "project", "exec", name]
|
|
||||||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
|
||||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
|
||||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
|
||||||
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
|
|
||||||
if verbose:
|
|
||||||
dvc_cmd.append("--verbose")
|
|
||||||
if silent:
|
|
||||||
dvc_cmd.append("--quiet")
|
|
||||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
|
||||||
commands.append(" ".join(full_cmd))
|
|
||||||
with working_dir(path):
|
|
||||||
run_commands(commands, variables, silent=True)
|
|
||||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
|
||||||
content = f.read()
|
|
||||||
f.seek(0, 0)
|
|
||||||
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_dvc() -> None:
|
|
||||||
"""Ensure that the "dvc" command is available and show an error if not."""
|
|
||||||
try:
|
|
||||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
|
||||||
except Exception:
|
|
||||||
msg.fail(
|
|
||||||
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
|
|
||||||
"You can install the Python package from pip (pip install dvc) or "
|
|
||||||
"conda (conda install -c conda-forge dvc). For more details, see the "
|
|
||||||
"documentation: https://dvc.org/doc/install",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
|
|
||||||
"""Check that the project is set up correctly with DVC and update its
|
|
||||||
config if needed. Will raise an error if the project is not an initialized
|
|
||||||
DVC project.
|
|
||||||
|
|
||||||
project_dir (Path): The path to the project directory.
|
|
||||||
config (Dict[str, Any]): The loaded project config.
|
|
||||||
"""
|
|
||||||
if not project_dir.exists():
|
|
||||||
msg.fail(f"Can't find project directory: {project_dir}")
|
|
||||||
if not (project_dir / ".dvc").exists():
|
|
||||||
msg.fail(
|
|
||||||
"Project not initialized as a DVC project.",
|
|
||||||
f"Make sure that the project template was cloned correctly. To "
|
|
||||||
f"initialize the project directory manually, you can run: "
|
|
||||||
f"{COMMAND} project init {project_dir}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
with msg.loading("Updating DVC config..."):
|
|
||||||
updated = update_dvc_config(project_dir, config, silent=True)
|
|
||||||
if updated:
|
|
||||||
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|
|
||||||
|
|
||||||
|
|
||||||
def run_commands(
|
|
||||||
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
|
|
||||||
) -> None:
|
|
||||||
"""Run a sequence of commands in a subprocess, in order.
|
|
||||||
|
|
||||||
commands (List[str]): The string commands.
|
|
||||||
variables (Dict[str, str]): Dictionary of variable names, mapped to their
|
|
||||||
values. Will be used to substitute format string variables in the
|
|
||||||
commands.
|
|
||||||
silent (bool): Don't print the commands.
|
|
||||||
"""
|
|
||||||
for command in commands:
|
|
||||||
# Substitute variables, e.g. "./{NAME}.json"
|
|
||||||
command = command.format(**variables)
|
|
||||||
command = split_command(command)
|
|
||||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
|
||||||
# use commands in their config that reference "python" and we want to
|
|
||||||
# make sure that it's always executing the same Python that spaCy is
|
|
||||||
# executed with and the pip in the same env, not some other Python/pip.
|
|
||||||
# Also ensures cross-compatibility if user 1 writes "python3" (because
|
|
||||||
# that's how it's set up on their system), and user 2 without the
|
|
||||||
# shortcut tries to re-run the command.
|
|
||||||
if len(command) and command[0] in ("python", "python3"):
|
|
||||||
command[0] = sys.executable
|
|
||||||
elif len(command) and command[0] in ("pip", "pip3"):
|
|
||||||
command = [sys.executable, "-m", "pip", *command[1:]]
|
|
||||||
if not silent:
|
|
||||||
print(f"Running command: {' '.join(command)}")
|
|
||||||
run_command(command)
|
|
||||||
|
|
||||||
|
|
||||||
def convert_asset_url(url: str) -> str:
|
|
||||||
"""Check and convert the asset URL if needed.
|
|
||||||
|
|
||||||
url (str): The asset URL.
|
|
||||||
RETURNS (str): The converted URL.
|
|
||||||
"""
|
|
||||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
|
||||||
if re.match("(http(s?)):\/\/github.com", url):
|
|
||||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
|
||||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
|
||||||
msg.warn(
|
|
||||||
"Downloading from a regular GitHub URL. This will only download "
|
|
||||||
"the source of the page, not the actual file. Converting the URL "
|
|
||||||
"to a raw URL.",
|
|
||||||
converted,
|
|
||||||
)
|
|
||||||
return converted
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
def check_clone(name: str, dest: Path, repo: str) -> None:
|
|
||||||
"""Check and validate that the destination path can be used to clone. Will
|
|
||||||
check that Git is available and that the destination path is suitable.
|
|
||||||
|
|
||||||
name (str): Name of the directory to clone from the repo.
|
|
||||||
dest (Path): Local destination of cloned directory.
|
|
||||||
repo (str): URL of the repo to clone from.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
|
|
||||||
except Exception:
|
|
||||||
msg.fail(
|
|
||||||
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
|
||||||
f"To clone a project without Git, copy the files from the '{name}' "
|
|
||||||
f"directory in the {repo} to {dest} manually and then run:",
|
|
||||||
f"{COMMAND} project init {dest}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if not dest:
|
|
||||||
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
|
||||||
if dest.exists():
|
|
||||||
# Directory already exists (not allowed, clone needs to create it)
|
|
||||||
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
|
|
||||||
if not dest.parent.exists():
|
|
||||||
# We're not creating parents, parent dir should exist
|
|
||||||
msg.fail(
|
|
||||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
|
|
||||||
"""Check that a subcommand is valid and defined. Raises an error otherwise.
|
|
||||||
|
|
||||||
commands (Sequence[str]): The available commands.
|
|
||||||
subcommand (str): The subcommand.
|
|
||||||
"""
|
|
||||||
if subcommand not in commands:
|
|
||||||
msg.fail(
|
|
||||||
f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
|
|
||||||
f"Available commands: {', '.join(commands)}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
|
||||||
"""Download a file using requests.
|
|
||||||
|
|
||||||
url (str): The URL of the file.
|
|
||||||
dest (Path): The destination path.
|
|
||||||
chunk_size (int): The size of chunks to read/write.
|
|
||||||
"""
|
|
||||||
response = requests.get(url, stream=True)
|
|
||||||
response.raise_for_status()
|
|
||||||
total = int(response.headers.get("content-length", 0))
|
|
||||||
progress_settings = {
|
|
||||||
"total": total,
|
|
||||||
"unit": "iB",
|
|
||||||
"unit_scale": True,
|
|
||||||
"unit_divisor": chunk_size,
|
|
||||||
"leave": False,
|
|
||||||
}
|
|
||||||
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
|
|
||||||
for data in response.iter_content(chunk_size=chunk_size):
|
|
||||||
size = f.write(data)
|
|
||||||
bar.update(size)
|
|
0
spacy/cli/project/__init__.py
Normal file
0
spacy/cli/project/__init__.py
Normal file
154
spacy/cli/project/assets.py
Normal file
154
spacy/cli/project/assets.py
Normal file
|
@ -0,0 +1,154 @@
|
||||||
|
from typing import Optional
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import requests
|
||||||
|
import tqdm
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from ...util import ensure_path, get_checksum, working_dir
|
||||||
|
from .._app import project_cli, Arg
|
||||||
|
from .util import PROJECT_FILE, load_project_config
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: find a solution for caches
|
||||||
|
# CACHES = [
|
||||||
|
# Path.home() / ".torch",
|
||||||
|
# Path.home() / ".caches" / "torch",
|
||||||
|
# os.environ.get("TORCH_HOME"),
|
||||||
|
# Path.home() / ".keras",
|
||||||
|
# ]
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("assets")
|
||||||
|
def project_assets_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||||
|
defined in the "assets" section of the project.yml. If a checksum is
|
||||||
|
provided in the project.yml, the file is only downloaded if no local file
|
||||||
|
with the same checksum exists.
|
||||||
|
"""
|
||||||
|
project_assets(project_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def project_assets(project_dir: Path) -> None:
|
||||||
|
"""Fetch assets for a project using DVC if possible.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
"""
|
||||||
|
project_path = ensure_path(project_dir)
|
||||||
|
config = load_project_config(project_path)
|
||||||
|
assets = config.get("assets", {})
|
||||||
|
if not assets:
|
||||||
|
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||||
|
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
for asset in assets:
|
||||||
|
dest = asset["dest"].format(**variables)
|
||||||
|
url = asset.get("url")
|
||||||
|
checksum = asset.get("checksum")
|
||||||
|
if not url:
|
||||||
|
# project.yml defines asset without URL that the user has to place
|
||||||
|
check_private_asset(dest, checksum)
|
||||||
|
continue
|
||||||
|
url = url.format(**variables)
|
||||||
|
fetch_asset(project_path, url, dest, checksum)
|
||||||
|
|
||||||
|
|
||||||
|
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
|
||||||
|
"""Check and validate assets without a URL (private assets that the user
|
||||||
|
has to provide themselves) and give feedback about the checksum.
|
||||||
|
|
||||||
|
dest (Path): Desintation path of the asset.
|
||||||
|
checksum (Optional[str]): Optional checksum of the expected file.
|
||||||
|
"""
|
||||||
|
if not Path(dest).exists():
|
||||||
|
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
|
||||||
|
msg.warn(err)
|
||||||
|
else:
|
||||||
|
if checksum and checksum == get_checksum(dest):
|
||||||
|
msg.good(f"Asset exists with matching checksum: {dest}")
|
||||||
|
else:
|
||||||
|
msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_asset(
|
||||||
|
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
"""Fetch an asset from a given URL or path. If a checksum is provided and a
|
||||||
|
local file exists, it's only re-downloaded if the checksum doesn't match.
|
||||||
|
|
||||||
|
project_path (Path): Path to project directory.
|
||||||
|
url (str): URL or path to asset.
|
||||||
|
checksum (Optional[str]): Optional expected checksum of local file.
|
||||||
|
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
||||||
|
the asset failed.
|
||||||
|
"""
|
||||||
|
# TODO: add support for caches
|
||||||
|
dest_path = (project_path / dest).resolve()
|
||||||
|
if dest_path.exists() and checksum:
|
||||||
|
# If there's already a file, check for checksum
|
||||||
|
if checksum == get_checksum(dest_path):
|
||||||
|
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||||
|
return dest_path
|
||||||
|
with working_dir(project_path):
|
||||||
|
url = convert_asset_url(url)
|
||||||
|
try:
|
||||||
|
download_file(url, dest_path)
|
||||||
|
msg.good(f"Downloaded asset {dest}")
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
if Path(url).exists() and Path(url).is_file():
|
||||||
|
# If it's a local file, copy to destination
|
||||||
|
shutil.copy(url, str(dest_path))
|
||||||
|
msg.good(f"Copied local asset {dest}")
|
||||||
|
else:
|
||||||
|
msg.fail(f"Download failed: {dest}", e)
|
||||||
|
return
|
||||||
|
if checksum and checksum != get_checksum(dest_path):
|
||||||
|
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def convert_asset_url(url: str) -> str:
|
||||||
|
"""Check and convert the asset URL if needed.
|
||||||
|
|
||||||
|
url (str): The asset URL.
|
||||||
|
RETURNS (str): The converted URL.
|
||||||
|
"""
|
||||||
|
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||||
|
if re.match(r"(http(s?)):\/\/github.com", url):
|
||||||
|
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||||
|
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||||
|
msg.warn(
|
||||||
|
"Downloading from a regular GitHub URL. This will only download "
|
||||||
|
"the source of the page, not the actual file. Converting the URL "
|
||||||
|
"to a raw URL.",
|
||||||
|
converted,
|
||||||
|
)
|
||||||
|
return converted
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
||||||
|
"""Download a file using requests.
|
||||||
|
|
||||||
|
url (str): The URL of the file.
|
||||||
|
dest (Path): The destination path.
|
||||||
|
chunk_size (int): The size of chunks to read/write.
|
||||||
|
"""
|
||||||
|
response = requests.get(url, stream=True)
|
||||||
|
response.raise_for_status()
|
||||||
|
total = int(response.headers.get("content-length", 0))
|
||||||
|
progress_settings = {
|
||||||
|
"total": total,
|
||||||
|
"unit": "iB",
|
||||||
|
"unit_scale": True,
|
||||||
|
"unit_divisor": chunk_size,
|
||||||
|
"leave": False,
|
||||||
|
}
|
||||||
|
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
|
||||||
|
for data in response.iter_content(chunk_size=chunk_size):
|
||||||
|
size = f.write(data)
|
||||||
|
bar.update(size)
|
110
spacy/cli/project/clone.py
Normal file
110
spacy/cli/project/clone.py
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import subprocess
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from ... import about
|
||||||
|
from ...util import ensure_path, run_command, make_tempdir
|
||||||
|
from .._app import project_cli, Arg, Opt, COMMAND
|
||||||
|
|
||||||
|
|
||||||
|
DIRS = [
|
||||||
|
"assets",
|
||||||
|
"metas",
|
||||||
|
"configs",
|
||||||
|
"packages",
|
||||||
|
"metrics",
|
||||||
|
"scripts",
|
||||||
|
"notebooks",
|
||||||
|
"training",
|
||||||
|
"corpus",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("clone")
|
||||||
|
def project_clone_cli(
|
||||||
|
# fmt: off
|
||||||
|
name: str = Arg(..., help="The name of the template to fetch"),
|
||||||
|
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
||||||
|
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Clone a project template from a repository. Calls into "git" and will
|
||||||
|
only download the files from the given subdirectory. The GitHub repo
|
||||||
|
defaults to the official spaCy template repo, but can be customized
|
||||||
|
(including using a private repo). Setting the --git flag will also
|
||||||
|
initialize the project directory as a Git repo. If the project is intended
|
||||||
|
to be a Git repo, it should be initialized with Git first, before
|
||||||
|
initializing DVC (Data Version Control). This allows DVC to integrate with
|
||||||
|
Git.
|
||||||
|
"""
|
||||||
|
if dest == Path.cwd():
|
||||||
|
dest = dest / name
|
||||||
|
project_clone(name, dest, repo=repo)
|
||||||
|
|
||||||
|
|
||||||
|
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
|
||||||
|
"""Clone a project template from a repository.
|
||||||
|
|
||||||
|
name (str): Name of subdirectory to clone.
|
||||||
|
dest (Path): Destination path of cloned project.
|
||||||
|
repo (str): URL of Git repo containing project templates.
|
||||||
|
"""
|
||||||
|
dest = ensure_path(dest)
|
||||||
|
check_clone(name, dest, repo)
|
||||||
|
project_dir = dest.resolve()
|
||||||
|
# We're using Git and sparse checkout to only clone the files we need
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
||||||
|
try:
|
||||||
|
run_command(cmd)
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
|
||||||
|
msg.fail(err)
|
||||||
|
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
||||||
|
f.write(name)
|
||||||
|
try:
|
||||||
|
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
||||||
|
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
err = f"Could not clone '{name}' in the repo '{repo}'."
|
||||||
|
msg.fail(err)
|
||||||
|
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
||||||
|
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
|
||||||
|
for sub_dir in DIRS:
|
||||||
|
dir_path = project_dir / sub_dir
|
||||||
|
if not dir_path.exists():
|
||||||
|
dir_path.mkdir(parents=True)
|
||||||
|
msg.good(f"Your project is now ready!", dest)
|
||||||
|
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||||
|
"""Check and validate that the destination path can be used to clone. Will
|
||||||
|
check that Git is available and that the destination path is suitable.
|
||||||
|
|
||||||
|
name (str): Name of the directory to clone from the repo.
|
||||||
|
dest (Path): Local destination of cloned directory.
|
||||||
|
repo (str): URL of the repo to clone from.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
|
||||||
|
except Exception:
|
||||||
|
msg.fail(
|
||||||
|
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
||||||
|
f"To clone a project without Git, copy the files from the '{name}' "
|
||||||
|
f"directory in the {repo} to {dest} manually and then run:",
|
||||||
|
f"{COMMAND} project init {dest}",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
if not dest:
|
||||||
|
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
||||||
|
if dest.exists():
|
||||||
|
# Directory already exists (not allowed, clone needs to create it)
|
||||||
|
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
|
||||||
|
if not dest.parent.exists():
|
||||||
|
# We're not creating parents, parent dir should exist
|
||||||
|
msg.fail(
|
||||||
|
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
||||||
|
exits=1,
|
||||||
|
)
|
206
spacy/cli/project/dvc.py
Normal file
206
spacy/cli/project/dvc.py
Normal file
|
@ -0,0 +1,206 @@
|
||||||
|
"""This module contains helpers and subcommands for integrating spaCy projects
|
||||||
|
with Data Version Controk (DVC). https://dvc.org"""
|
||||||
|
from typing import Dict, Any, List, Optional
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
|
from .util import PROJECT_FILE, load_project_config
|
||||||
|
from .._app import project_cli, Arg, Opt, NAME, COMMAND
|
||||||
|
from ...util import get_hash, working_dir, split_command, join_command, run_command
|
||||||
|
|
||||||
|
|
||||||
|
DVC_CONFIG = "dvc.yaml"
|
||||||
|
DVC_DIR = ".dvc"
|
||||||
|
UPDATE_COMMAND = "dvc"
|
||||||
|
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
|
||||||
|
# edited your {PROJECT_FILE}, you can regenerate this file by running:
|
||||||
|
# {COMMAND} project {UPDATE_COMMAND}"""
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(UPDATE_COMMAND)
|
||||||
|
def project_update_dvc_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
|
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
|
||||||
|
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||||
|
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Auto-generate Data Version Control (DVC) config. A DVC
|
||||||
|
project can only define one pipeline, so you need to specify one workflow
|
||||||
|
defined in the project.yml. If no workflow is specified, the first defined
|
||||||
|
workflow is used. The DVC config will only be updated if
|
||||||
|
"""
|
||||||
|
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
||||||
|
|
||||||
|
|
||||||
|
def project_update_dvc(
|
||||||
|
project_dir: Path,
|
||||||
|
workflow: Optional[str] = None,
|
||||||
|
*,
|
||||||
|
verbose: bool = False,
|
||||||
|
force: bool = False,
|
||||||
|
) -> None:
|
||||||
|
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
|
||||||
|
project can only define one pipeline, so you need to specify one workflow
|
||||||
|
defined in the project.yml. Will only update the file if the checksum changed.
|
||||||
|
|
||||||
|
project_dir (Path): The project directory.
|
||||||
|
workflow (Optional[str]): Optional name of workflow defined in project.yml.
|
||||||
|
If not set, the first workflow will be used.
|
||||||
|
verbose (bool): Print more info.
|
||||||
|
force (bool): Force update DVC config.
|
||||||
|
"""
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
updated = update_dvc_config(
|
||||||
|
project_dir, config, workflow, verbose=verbose, force=force
|
||||||
|
)
|
||||||
|
help_msg = "To execute the workflow with DVC, run: dvc repro"
|
||||||
|
if updated:
|
||||||
|
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
|
||||||
|
else:
|
||||||
|
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
|
||||||
|
|
||||||
|
|
||||||
|
def update_dvc_config(
|
||||||
|
path: Path,
|
||||||
|
config: Dict[str, Any],
|
||||||
|
workflow: Optional[str] = None,
|
||||||
|
verbose: bool = False,
|
||||||
|
silent: bool = False,
|
||||||
|
force: bool = False,
|
||||||
|
) -> bool:
|
||||||
|
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||||
|
project directory. The file is auto-generated based on the config. The
|
||||||
|
first line of the auto-generated file specifies the hash of the config
|
||||||
|
dict, so if any of the config values change, the DVC config is regenerated.
|
||||||
|
|
||||||
|
path (Path): The path to the project directory.
|
||||||
|
config (Dict[str, Any]): The loaded project.yml.
|
||||||
|
verbose (bool): Whether to print additional info (via DVC).
|
||||||
|
silent (bool): Don't output anything (via DVC).
|
||||||
|
force (bool): Force update, even if hashes match.
|
||||||
|
RETURNS (bool): Whether the DVC config file was updated.
|
||||||
|
"""
|
||||||
|
ensure_dvc(path)
|
||||||
|
workflows = config.get("workflows", {})
|
||||||
|
workflow_names = list(workflows.keys())
|
||||||
|
check_workflows(workflow_names, workflow)
|
||||||
|
if not workflow:
|
||||||
|
workflow = workflow_names[0]
|
||||||
|
config_hash = get_hash(config)
|
||||||
|
path = path.resolve()
|
||||||
|
dvc_config_path = path / DVC_CONFIG
|
||||||
|
if dvc_config_path.exists():
|
||||||
|
# Check if the file was generated using the current config, if not, redo
|
||||||
|
with dvc_config_path.open("r", encoding="utf8") as f:
|
||||||
|
ref_hash = f.readline().strip().replace("# ", "")
|
||||||
|
if ref_hash == config_hash and not force:
|
||||||
|
return False # Nothing has changed in project.yml, don't need to update
|
||||||
|
dvc_config_path.unlink()
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
dvc_commands = []
|
||||||
|
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
|
for name in workflows[workflow]:
|
||||||
|
command = config_commands[name]
|
||||||
|
deps = command.get("deps", [])
|
||||||
|
outputs = command.get("outputs", [])
|
||||||
|
outputs_no_cache = command.get("outputs_no_cache", [])
|
||||||
|
if not deps and not outputs and not outputs_no_cache:
|
||||||
|
continue
|
||||||
|
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
||||||
|
# and we don't want arbitrary paths in there
|
||||||
|
project_cmd = ["python", "-m", NAME, "project", "run", name]
|
||||||
|
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||||
|
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||||
|
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||||
|
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
|
||||||
|
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||||
|
dvc_commands.append(join_command(full_cmd))
|
||||||
|
with working_dir(path):
|
||||||
|
dvc_flags = {"--verbose": verbose, "--quiet": silent}
|
||||||
|
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
|
||||||
|
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||||
|
content = f.read()
|
||||||
|
f.seek(0, 0)
|
||||||
|
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def run_dvc_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, str] = None,
    flags: Dict[str, bool] = None,
) -> None:
    """Run a sequence of DVC commands in a subprocess, in order.

    commands (List[str]): The string commands without the leading "dvc".
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands. Defaults to an empty dict when None.
    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
        easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
        Defaults to an empty dict when None.
    """
    # Use None sentinels instead of mutable {} defaults: a literal dict default
    # would be shared across all calls to this function.
    variables = {} if variables is None else variables
    flags = {} if flags is None else flags
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        dvc_command = ["dvc", *command]
        # Add the flags if they are set to True
        for flag, is_active in flags.items():
            if is_active:
                dvc_command.append(flag)
        run_command(dvc_command)
|
||||||
|
|
||||||
|
|
||||||
|
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    # No workflows at all: there is nothing a DVC config could be built from.
    if not workflows:
        err = (
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands."
        )
        msg.fail(err, exits=1)
    # A workflow was named but doesn't exist in the config.
    if workflow is not None and workflow not in workflows:
        err = (
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}"
        )
        msg.fail(err, exits=1)
    # No workflow named: warn that the first one will be used implicitly.
    if not workflow:
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project.

    project_dir (Path): The project directory to check for a .dvc folder.
    """
    install_help = (
        "You can install the Python package from pip (pip install dvc) or "
        "conda (conda install -c conda-forge dvc). For more details, see the "
        "documentation: https://dvc.org/doc/install"
    )
    init_help = (
        "To initialize a DVC project, you can run 'dvc init' in the project "
        "directory. For more details, see the documentation: "
        "https://dvc.org/doc/command-reference/init"
    )
    try:
        # We only care whether the executable can be invoked; its version
        # output is discarded.
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            install_help,
            exits=1,
        )
    # An initialized DVC project always has a .dvc directory at its root.
    if not (project_dir / ".dvc").exists():
        msg.fail("Project not initialized as a DVC project", init_help, exits=1)
|
250
spacy/cli/project/run.py
Normal file
250
spacy/cli/project/run.py
Normal file
|
@ -0,0 +1,250 @@
|
||||||
|
from typing import Optional, List, Dict, Sequence, Any
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import typer
|
||||||
|
import sys
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from ...util import working_dir, run_command, split_command, is_cwd, get_checksum
|
||||||
|
from ...util import get_hash, join_command
|
||||||
|
from .._app import project_cli, Arg, Opt, COMMAND
|
||||||
|
from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named script or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define inputs and/or outputs, they will only be re-run if state
    has changed.
    """
    # With no subcommand (or explicit --help), print the project's own help
    # screen instead of running anything.
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        # Extra CLI args collected by typer (allow_extra_args) are forwarded
        # positionally. NOTE(review): project_run's signature is keyword-only
        # after `subcommand` — confirm it accepts these extra positionals.
        project_run(project_dir, subcommand, *ctx.args, force=force, dry=dry)
|
||||||
|
|
||||||
|
|
||||||
|
def project_run(
    project_dir: Path, subcommand: str, *cli_args, force: bool = False, dry: bool = False
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    cli_args: Extra positional arguments forwarded from the CLI (typer is
        configured with allow_extra_args, and project_run_cli passes them as
        *ctx.args). Currently unused, but accepting them here prevents a
        TypeError when any extra arguments are supplied on the command line.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    """
    config = load_project_config(project_dir)
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    if subcommand in workflows:
        # A workflow is just an ordered list of commands: run each in turn.
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(project_dir, cmd, force=force, dry=dry)
    else:
        cmd = commands[subcommand]
        # Check that all declared dependencies exist before running anything.
        # In a dry run this only warns instead of exiting.
        for dep in cmd.get("deps", []):
            dep = dep.format(**variables)
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, **err_kwargs)
        with working_dir(project_dir) as current_dir:
            rerun = check_rerun(current_dir, cmd, variables)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                msg.divider(subcommand)
                run_commands(cmd["script"], variables, dry=dry)
                update_lockfile(current_dir, cmd, variables)
|
||||||
|
|
||||||
|
|
||||||
|
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    project_loc = "" if is_cwd(project_dir) else project_dir
    if subcommand:
        # Bug fix: validate_subcommand expects (commands, workflows, subcommand).
        # The workflows argument was previously missing, which raised a
        # TypeError whenever a subcommand was given.
        validate_subcommand(commands.keys(), workflows.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
        # The subcommand may be a workflow name, which has no entry in
        # `commands` — use .get() to avoid a KeyError in that case.
        help_text = commands.get(subcommand, {}).get("help")
        if help_text:
            msg.text(f"\n{help_text}\n")
    else:
        print(f"\nAvailable commands in {PROJECT_FILE}")
        print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:")
        print(f"{COMMAND} project run {project_loc}")
|
||||||
|
|
||||||
|
|
||||||
|
def run_commands(
    commands: List[str] = tuple(),
    variables: Optional[Dict[str, Any]] = None,
    silent: bool = False,
    dry: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    variables (Optional[Dict[str, Any]]): Dictionary of variable names, mapped
        to their values. Will be used to substitute format string variables in
        the commands. Defaults to an empty dict when None.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    """
    # Use a None sentinel instead of a mutable {} default: a literal dict
    # default would be shared across all calls to this function.
    variables = {} if variables is None else variables
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if len(command) and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif len(command) and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if not dry:
            run_command(command)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    # Known command or workflow: nothing to report.
    if subcommand in commands or subcommand in workflows:
        return
    # Build a hint listing whatever kinds of targets are available.
    hints = []
    if commands:
        hints.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        hints.append(f"Available workflows: {', '.join(workflows)}")
    msg.fail(
        f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
        ". ".join(hints),
        exits=1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def check_rerun(
|
||||||
|
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
|
||||||
|
) -> bool:
|
||||||
|
"""Check if a command should be rerun because its settings or inputs/outputs
|
||||||
|
changed.
|
||||||
|
|
||||||
|
project_dir (Path): The current project directory.
|
||||||
|
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||||
|
variables (Dict[str, Any]): The variables defined in the project.yml.
|
||||||
|
RETURNS (bool): Whether to re-run the command.
|
||||||
|
"""
|
||||||
|
lock_path = project_dir / PROJECT_LOCK
|
||||||
|
if not lock_path.exists(): # We don't have a lockfile, run command
|
||||||
|
return True
|
||||||
|
data = srsly.read_yaml(lock_path)
|
||||||
|
if command["name"] not in data: # We don't have info about this command
|
||||||
|
return True
|
||||||
|
entry = data[command["name"]]
|
||||||
|
# If the entry in the lockfile matches the lockfile entry that would be
|
||||||
|
# generated from the current command, we don't rerun because it means that
|
||||||
|
# all inputs/outputs, hashes and scripts are the same and nothing changed
|
||||||
|
return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
|
||||||
|
|
||||||
|
|
||||||
|
def update_lockfile(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if lock_path.exists():
        data = srsly.read_yaml(lock_path)
    else:
        # No lockfile yet: create an empty one and start from a fresh dict.
        srsly.write_yaml(lock_path, {})
        data = {}
    data[command["name"]] = get_lock_entry(project_dir, command, variables)
    srsly.write_yaml(lock_path, data)
|
||||||
|
|
||||||
|
|
||||||
|
def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    # Collect path + checksum records for dependencies and both output kinds.
    file_info = {
        key: get_fileinfo(project_dir, command.get(key, []), variables)
        for key in ("deps", "outputs", "outputs_no_cache")
    }
    return {
        "cmd": f"{COMMAND} run {command['name']}",
        "script": command["script"],
        "deps": file_info["deps"],
        # Cached and uncached outputs are recorded together, like dvc.lock.
        "outs": [*file_info["outputs"], *file_info["outputs_no_cache"]],
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_fileinfo(
    project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
    """
    entries = []
    for raw_path in paths:
        # Substitute variables, e.g. "./{NAME}.json"
        rel_path = raw_path.format(**variables)
        abs_path = project_dir / rel_path
        # Missing files get a null checksum rather than raising.
        checksum = get_checksum(abs_path) if abs_path.exists() else None
        entries.append({"path": rel_path, "md5": checksum})
    return entries
|
57
spacy/cli/project/util.py
Normal file
57
spacy/cli/project/util.py
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
from typing import Dict, Any
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from ...schemas import ProjectConfigSchema, validate
|
||||||
|
|
||||||
|
|
||||||
|
PROJECT_FILE = "project.yml"
|
||||||
|
PROJECT_LOCK = "project.lock"
|
||||||
|
|
||||||
|
|
||||||
|
def load_project_config(path: Path) -> Dict[str, Any]:
    """Load the project.yml file from a directory and validate it.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        # msg.fail with exits=1 terminates the process here.
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        # Malformed YAML: report the parse error and exit instead of
        # surfacing a raw traceback.
        msg.fail(invalid_err, e, exits=1)
    # Schema validation returns a list of human-readable error strings.
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err, "\n".join(errors), exits=1)
    # Cross-checks commands/workflows (duplicates, clashes, unknown steps).
    validate_project_commands(config)
    return config
|
||||||
|
|
||||||
|
|
||||||
|
def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    # Any name that occurs more than once is a duplicate definition.
    duplicates = {name for name in command_names if command_names.count(name) > 1}
    if duplicates:
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        # Workflow names must not shadow command names.
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        # Every step of a workflow must be a defined command.
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
|
|
@ -203,7 +203,8 @@ def train(
|
||||||
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
||||||
train_examples = list(
|
train_examples = list(
|
||||||
corpus.train_dataset(
|
corpus.train_dataset(
|
||||||
nlp, shuffle=False, gold_preproc=training["gold_preproc"]
|
nlp, shuffle=False, gold_preproc=training["gold_preproc"],
|
||||||
|
max_length=training["max_length"]
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
nlp.begin_training(lambda: train_examples)
|
nlp.begin_training(lambda: train_examples)
|
||||||
|
@ -306,11 +307,18 @@ def create_train_batches(nlp, corpus, cfg):
|
||||||
if len(train_examples) == 0:
|
if len(train_examples) == 0:
|
||||||
raise ValueError(Errors.E988)
|
raise ValueError(Errors.E988)
|
||||||
epoch += 1
|
epoch += 1
|
||||||
batches = util.minibatch_by_words(
|
if cfg.get("batch_by_words", True):
|
||||||
train_examples,
|
batches = util.minibatch_by_words(
|
||||||
size=cfg["batch_size"],
|
train_examples,
|
||||||
discard_oversize=cfg["discard_oversize"],
|
size=cfg["batch_size"],
|
||||||
)
|
discard_oversize=cfg["discard_oversize"],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
batches = util.minibatch(
|
||||||
|
train_examples,
|
||||||
|
size=cfg["batch_size"],
|
||||||
|
)
|
||||||
|
|
||||||
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
||||||
try:
|
try:
|
||||||
first = next(batches)
|
first = next(batches)
|
||||||
|
|
|
@ -477,15 +477,14 @@ class Errors(object):
|
||||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
|
||||||
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
|
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
|
||||||
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
||||||
"array and {doc_length} for the Doc itself.")
|
"array and {doc_length} for the Doc itself.")
|
||||||
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
|
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
|
||||||
E973 = ("Unexpected type for NER data")
|
E973 = ("Unexpected type for NER data")
|
||||||
E974 = ("Unknown {obj} attribute: {key}")
|
E974 = ("Unknown {obj} attribute: {key}")
|
||||||
E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
|
E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
|
||||||
"but got {type}")
|
|
||||||
E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
|
|
||||||
"but received None.")
|
"but received None.")
|
||||||
E977 = ("Can not compare a MorphAnalysis with a string object. "
|
E977 = ("Can not compare a MorphAnalysis with a string object. "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from .corpus import Corpus
|
from .corpus import Corpus
|
||||||
from .example import Example
|
from .example import Example
|
||||||
from .align import align
|
from .align import Alignment
|
||||||
|
|
||||||
from .iob_utils import iob_to_biluo, biluo_to_iob
|
from .iob_utils import iob_to_biluo, biluo_to_iob
|
||||||
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
|
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||||
|
|
|
@ -1,8 +0,0 @@
|
||||||
cdef class Alignment:
|
|
||||||
cdef public object cost
|
|
||||||
cdef public object i2j
|
|
||||||
cdef public object j2i
|
|
||||||
cdef public object i2j_multi
|
|
||||||
cdef public object j2i_multi
|
|
||||||
cdef public object cand_to_gold
|
|
||||||
cdef public object gold_to_cand
|
|
30
spacy/gold/align.py
Normal file
30
spacy/gold/align.py
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
from typing import List
|
||||||
|
import numpy
|
||||||
|
from thinc.types import Ragged
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import tokenizations
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Alignment:
    """Token alignment between two tokenizations, stored as ragged arrays of
    aligned token indices in both directions (x-to-y and y-to-x)."""

    x2y: Ragged
    y2x: Ragged

    @classmethod
    def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
        """Construct an Alignment from nested lists of aligned indices."""
        return cls(x2y=_make_ragged(x2y), y2x=_make_ragged(y2x))

    @classmethod
    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
        """Compute the alignment between two lists of token strings."""
        x2y, y2x = tokenizations.get_alignments(A, B)
        return cls.from_indices(x2y=x2y, y2x=y2x)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_ragged(indices):
    """Flatten nested index lists into one int array plus per-row lengths,
    wrapped in a thinc Ragged."""
    lengths = numpy.array([len(row) for row in indices], dtype="i")
    # Flatten the nested lists into a single sequence of indices.
    flat = [i for row in indices for i in row]
    return Ragged(numpy.array(flat, dtype="i"), lengths)
|
|
@ -1,101 +0,0 @@
|
||||||
import numpy
|
|
||||||
from ..errors import Errors, AlignmentError
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Alignment:
|
|
||||||
def __init__(self, spacy_words, gold_words):
|
|
||||||
# Do many-to-one alignment for misaligned tokens.
|
|
||||||
# If we over-segment, we'll have one gold word that covers a sequence
|
|
||||||
# of predicted words
|
|
||||||
# If we under-segment, we'll have one predicted word that covers a
|
|
||||||
# sequence of gold words.
|
|
||||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
|
||||||
# a sequence of gold words. That's many-to-many -- we don't do that
|
|
||||||
# except for NER spans where the start and end can be aligned.
|
|
||||||
cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words)
|
|
||||||
self.cost = cost
|
|
||||||
self.i2j = i2j
|
|
||||||
self.j2i = j2i
|
|
||||||
self.i2j_multi = i2j_multi
|
|
||||||
self.j2i_multi = j2i_multi
|
|
||||||
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
|
||||||
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
|
|
||||||
|
|
||||||
|
|
||||||
def align(tokens_a, tokens_b):
|
|
||||||
"""Calculate alignment tables between two tokenizations.
|
|
||||||
|
|
||||||
tokens_a (List[str]): The candidate tokenization.
|
|
||||||
tokens_b (List[str]): The reference tokenization.
|
|
||||||
RETURNS: (tuple): A 5-tuple consisting of the following information:
|
|
||||||
* cost (int): The number of misaligned tokens.
|
|
||||||
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
|
|
||||||
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
|
|
||||||
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
|
|
||||||
it has the value -1.
|
|
||||||
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
|
|
||||||
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
|
|
||||||
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
|
|
||||||
the same token of `tokens_b`.
|
|
||||||
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
|
||||||
direction.
|
|
||||||
"""
|
|
||||||
tokens_a = _normalize_for_alignment(tokens_a)
|
|
||||||
tokens_b = _normalize_for_alignment(tokens_b)
|
|
||||||
cost = 0
|
|
||||||
a2b = numpy.empty(len(tokens_a), dtype="i")
|
|
||||||
b2a = numpy.empty(len(tokens_b), dtype="i")
|
|
||||||
a2b.fill(-1)
|
|
||||||
b2a.fill(-1)
|
|
||||||
a2b_multi = {}
|
|
||||||
b2a_multi = {}
|
|
||||||
i = 0
|
|
||||||
j = 0
|
|
||||||
offset_a = 0
|
|
||||||
offset_b = 0
|
|
||||||
while i < len(tokens_a) and j < len(tokens_b):
|
|
||||||
a = tokens_a[i][offset_a:]
|
|
||||||
b = tokens_b[j][offset_b:]
|
|
||||||
if a == b:
|
|
||||||
if offset_a == offset_b == 0:
|
|
||||||
a2b[i] = j
|
|
||||||
b2a[j] = i
|
|
||||||
elif offset_a == 0:
|
|
||||||
cost += 2
|
|
||||||
a2b_multi[i] = j
|
|
||||||
elif offset_b == 0:
|
|
||||||
cost += 2
|
|
||||||
b2a_multi[j] = i
|
|
||||||
offset_a = offset_b = 0
|
|
||||||
i += 1
|
|
||||||
j += 1
|
|
||||||
elif a == "":
|
|
||||||
assert offset_a == 0
|
|
||||||
cost += 1
|
|
||||||
i += 1
|
|
||||||
elif b == "":
|
|
||||||
assert offset_b == 0
|
|
||||||
cost += 1
|
|
||||||
j += 1
|
|
||||||
elif b.startswith(a):
|
|
||||||
cost += 1
|
|
||||||
if offset_a == 0:
|
|
||||||
a2b_multi[i] = j
|
|
||||||
i += 1
|
|
||||||
offset_a = 0
|
|
||||||
offset_b += len(a)
|
|
||||||
elif a.startswith(b):
|
|
||||||
cost += 1
|
|
||||||
if offset_b == 0:
|
|
||||||
b2a_multi[j] = i
|
|
||||||
j += 1
|
|
||||||
offset_b = 0
|
|
||||||
offset_a += len(b)
|
|
||||||
else:
|
|
||||||
assert "".join(tokens_a) != "".join(tokens_b)
|
|
||||||
raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
|
|
||||||
return cost, a2b, b2a, a2b_multi, b2a_multi
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_for_alignment(tokens):
|
|
||||||
return [w.replace(" ", "").lower() for w in tokens]
|
|
|
@ -1,8 +1,7 @@
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from .align cimport Alignment
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Example:
|
cdef class Example:
|
||||||
cdef readonly Doc x
|
cdef readonly Doc x
|
||||||
cdef readonly Doc y
|
cdef readonly Doc y
|
||||||
cdef readonly Alignment _alignment
|
cdef readonly object _alignment
|
||||||
|
|
|
@ -6,10 +6,9 @@ from ..tokens.doc cimport Doc
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
from ..tokens.span import Span
|
from ..tokens.span import Span
|
||||||
from ..attrs import IDS
|
from ..attrs import IDS
|
||||||
from .align cimport Alignment
|
from .align import Alignment
|
||||||
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
||||||
from .iob_utils import spans_from_biluo_tags
|
from .iob_utils import spans_from_biluo_tags
|
||||||
from .align import Alignment
|
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..syntax import nonproj
|
from ..syntax import nonproj
|
||||||
|
|
||||||
|
@ -28,8 +27,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||||
|
|
||||||
|
|
||||||
cdef class Example:
|
cdef class Example:
|
||||||
def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
|
def __init__(self, Doc predicted, Doc reference, *, alignment=None):
|
||||||
""" Doc can either be text, or an actual Doc """
|
|
||||||
if predicted is None:
|
if predicted is None:
|
||||||
raise TypeError(Errors.E972.format(arg="predicted"))
|
raise TypeError(Errors.E972.format(arg="predicted"))
|
||||||
if reference is None:
|
if reference is None:
|
||||||
|
@ -60,17 +58,15 @@ cdef class Example:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, Doc predicted, dict example_dict):
|
def from_dict(cls, Doc predicted, dict example_dict):
|
||||||
|
if predicted is None:
|
||||||
|
raise ValueError(Errors.E976.format(n="first", type="Doc"))
|
||||||
if example_dict is None:
|
if example_dict is None:
|
||||||
raise ValueError(Errors.E976)
|
raise ValueError(Errors.E976.format(n="second", type="dict"))
|
||||||
if not isinstance(predicted, Doc):
|
|
||||||
raise TypeError(Errors.E975.format(type=type(predicted)))
|
|
||||||
example_dict = _fix_legacy_dict_data(example_dict)
|
example_dict = _fix_legacy_dict_data(example_dict)
|
||||||
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
||||||
if "ORTH" not in tok_dict:
|
if "ORTH" not in tok_dict:
|
||||||
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
||||||
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
||||||
if not _has_field(tok_dict, "SPACY"):
|
|
||||||
spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
|
|
||||||
return Example(
|
return Example(
|
||||||
predicted,
|
predicted,
|
||||||
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
||||||
|
@ -83,34 +79,38 @@ cdef class Example:
|
||||||
gold_words = [token.orth_ for token in self.reference]
|
gold_words = [token.orth_ for token in self.reference]
|
||||||
if gold_words == []:
|
if gold_words == []:
|
||||||
gold_words = spacy_words
|
gold_words = spacy_words
|
||||||
self._alignment = Alignment(spacy_words, gold_words)
|
self._alignment = Alignment.from_strings(spacy_words, gold_words)
|
||||||
return self._alignment
|
return self._alignment
|
||||||
|
|
||||||
def get_aligned(self, field, as_string=False):
|
def get_aligned(self, field, as_string=False):
|
||||||
"""Return an aligned array for a token attribute."""
|
"""Return an aligned array for a token attribute."""
|
||||||
i2j_multi = self.alignment.i2j_multi
|
align = self.alignment.x2y
|
||||||
cand_to_gold = self.alignment.cand_to_gold
|
|
||||||
|
|
||||||
vocab = self.reference.vocab
|
vocab = self.reference.vocab
|
||||||
gold_values = self.reference.to_array([field])
|
gold_values = self.reference.to_array([field])
|
||||||
output = [None] * len(self.predicted)
|
output = [None] * len(self.predicted)
|
||||||
for i, gold_i in enumerate(cand_to_gold):
|
for token in self.predicted:
|
||||||
if self.predicted[i].text.isspace():
|
if token.is_space:
|
||||||
output[i] = None
|
output[token.i] = None
|
||||||
if gold_i is None:
|
|
||||||
if i in i2j_multi:
|
|
||||||
output[i] = gold_values[i2j_multi[i]]
|
|
||||||
else:
|
|
||||||
output[i] = None
|
|
||||||
else:
|
else:
|
||||||
output[i] = gold_values[gold_i]
|
values = gold_values[align[token.i].dataXd]
|
||||||
|
values = values.ravel()
|
||||||
|
if len(values) == 0:
|
||||||
|
output[token.i] = None
|
||||||
|
elif len(values) == 1:
|
||||||
|
output[token.i] = values[0]
|
||||||
|
elif len(set(list(values))) == 1:
|
||||||
|
# If all aligned tokens have the same value, use it.
|
||||||
|
output[token.i] = values[0]
|
||||||
|
else:
|
||||||
|
output[token.i] = None
|
||||||
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
||||||
output = [vocab.strings[o] if o is not None else o for o in output]
|
output = [vocab.strings[o] if o is not None else o for o in output]
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def get_aligned_parse(self, projectivize=True):
|
def get_aligned_parse(self, projectivize=True):
|
||||||
cand_to_gold = self.alignment.cand_to_gold
|
cand_to_gold = self.alignment.x2y
|
||||||
gold_to_cand = self.alignment.gold_to_cand
|
gold_to_cand = self.alignment.y2x
|
||||||
aligned_heads = [None] * self.x.length
|
aligned_heads = [None] * self.x.length
|
||||||
aligned_deps = [None] * self.x.length
|
aligned_deps = [None] * self.x.length
|
||||||
heads = [token.head.i for token in self.y]
|
heads = [token.head.i for token in self.y]
|
||||||
|
@ -118,52 +118,51 @@ cdef class Example:
|
||||||
if projectivize:
|
if projectivize:
|
||||||
heads, deps = nonproj.projectivize(heads, deps)
|
heads, deps = nonproj.projectivize(heads, deps)
|
||||||
for cand_i in range(self.x.length):
|
for cand_i in range(self.x.length):
|
||||||
gold_i = cand_to_gold[cand_i]
|
if cand_to_gold.lengths[cand_i] == 1:
|
||||||
if gold_i is not None: # Alignment found
|
gold_i = cand_to_gold[cand_i].dataXd[0, 0]
|
||||||
gold_head = gold_to_cand[heads[gold_i]]
|
if gold_to_cand.lengths[heads[gold_i]] == 1:
|
||||||
if gold_head is not None:
|
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0])
|
||||||
aligned_heads[cand_i] = gold_head
|
|
||||||
aligned_deps[cand_i] = deps[gold_i]
|
aligned_deps[cand_i] = deps[gold_i]
|
||||||
return aligned_heads, aligned_deps
|
return aligned_heads, aligned_deps
|
||||||
|
|
||||||
|
def get_aligned_spans_x2y(self, x_spans):
|
||||||
|
return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y)
|
||||||
|
|
||||||
|
def get_aligned_spans_y2x(self, y_spans):
|
||||||
|
return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x)
|
||||||
|
|
||||||
|
def _get_aligned_spans(self, doc, spans, align):
|
||||||
|
seen = set()
|
||||||
|
output = []
|
||||||
|
for span in spans:
|
||||||
|
indices = align[span.start : span.end].data.ravel()
|
||||||
|
indices = [idx for idx in indices if idx not in seen]
|
||||||
|
if len(indices) >= 1:
|
||||||
|
aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label)
|
||||||
|
target_text = span.text.lower().strip().replace(" ", "")
|
||||||
|
our_text = aligned_span.text.lower().strip().replace(" ", "")
|
||||||
|
if our_text == target_text:
|
||||||
|
output.append(aligned_span)
|
||||||
|
seen.update(indices)
|
||||||
|
return output
|
||||||
|
|
||||||
def get_aligned_ner(self):
|
def get_aligned_ner(self):
|
||||||
if not self.y.is_nered:
|
if not self.y.is_nered:
|
||||||
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
|
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
|
||||||
x_text = self.x.text
|
x_ents = self.get_aligned_spans_y2x(self.y.ents)
|
||||||
# Get a list of entities, and make spans for non-entity tokens.
|
# Default to 'None' for missing values
|
||||||
# We then work through the spans in order, trying to find them in
|
|
||||||
# the text and using that to get the offset. Any token that doesn't
|
|
||||||
# get a tag set this way is tagged None.
|
|
||||||
# This could maybe be improved? It at least feels easy to reason about.
|
|
||||||
y_spans = list(self.y.ents)
|
|
||||||
y_spans.sort()
|
|
||||||
x_text_offset = 0
|
|
||||||
x_spans = []
|
|
||||||
for y_span in y_spans:
|
|
||||||
if x_text.count(y_span.text) >= 1:
|
|
||||||
start_char = x_text.index(y_span.text) + x_text_offset
|
|
||||||
end_char = start_char + len(y_span.text)
|
|
||||||
x_span = self.x.char_span(start_char, end_char, label=y_span.label)
|
|
||||||
if x_span is not None:
|
|
||||||
x_spans.append(x_span)
|
|
||||||
x_text = self.x.text[end_char:]
|
|
||||||
x_text_offset = end_char
|
|
||||||
x_tags = biluo_tags_from_offsets(
|
x_tags = biluo_tags_from_offsets(
|
||||||
self.x,
|
self.x,
|
||||||
[(e.start_char, e.end_char, e.label_) for e in x_spans],
|
[(e.start_char, e.end_char, e.label_) for e in x_ents],
|
||||||
missing=None
|
missing=None
|
||||||
)
|
)
|
||||||
gold_to_cand = self.alignment.gold_to_cand
|
# Now fill the tokens we can align to O.
|
||||||
for token in self.y:
|
O = 2 # I=1, O=2, B=3
|
||||||
if token.ent_iob_ == "O":
|
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
|
||||||
cand_i = gold_to_cand[token.i]
|
if x_tags[i] is None:
|
||||||
if cand_i is not None and x_tags[cand_i] is None:
|
if ent_iob == O:
|
||||||
x_tags[cand_i] = "O"
|
x_tags[i] = "O"
|
||||||
i2j_multi = self.alignment.i2j_multi
|
elif self.x[i].is_space:
|
||||||
for i, tag in enumerate(x_tags):
|
|
||||||
if tag is None and i in i2j_multi:
|
|
||||||
gold_i = i2j_multi[i]
|
|
||||||
if gold_i is not None and self.y[gold_i].ent_iob_ == "O":
|
|
||||||
x_tags[i] = "O"
|
x_tags[i] = "O"
|
||||||
return x_tags
|
return x_tags
|
||||||
|
|
||||||
|
@ -194,25 +193,22 @@ cdef class Example:
|
||||||
links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
|
links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
def split_sents(self):
|
def split_sents(self):
|
||||||
""" Split the token annotations into multiple Examples based on
|
""" Split the token annotations into multiple Examples based on
|
||||||
sent_starts and return a list of the new Examples"""
|
sent_starts and return a list of the new Examples"""
|
||||||
if not self.reference.is_sentenced:
|
if not self.reference.is_sentenced:
|
||||||
return [self]
|
return [self]
|
||||||
|
|
||||||
sent_starts = self.get_aligned("SENT_START")
|
align = self.alignment.y2x
|
||||||
sent_starts.append(1) # appending virtual start of a next sentence to facilitate search
|
seen_indices = set()
|
||||||
|
|
||||||
output = []
|
output = []
|
||||||
pred_start = 0
|
for y_sent in self.reference.sents:
|
||||||
for sent in self.reference.sents:
|
indices = align[y_sent.start : y_sent.end].data.ravel()
|
||||||
new_ref = sent.as_doc()
|
indices = [idx for idx in indices if idx not in seen_indices]
|
||||||
pred_end = sent_starts.index(1, pred_start+1) # find where the next sentence starts
|
if indices:
|
||||||
new_pred = self.predicted[pred_start : pred_end].as_doc()
|
x_sent = self.predicted[indices[0] : indices[-1] + 1]
|
||||||
output.append(Example(new_pred, new_ref))
|
output.append(Example(x_sent.as_doc(), y_sent.as_doc()))
|
||||||
pred_start = pred_end
|
seen_indices.update(indices)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
property text:
|
property text:
|
||||||
|
@ -258,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
values.append([vocab.morphology.add(v) for v in value])
|
values.append([vocab.morphology.add(v) for v in value])
|
||||||
else:
|
else:
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.strings.add(v) for v in value])
|
try:
|
||||||
|
values.append([vocab.strings.add(v) for v in value])
|
||||||
|
except TypeError:
|
||||||
|
types= set([type(v) for v in value])
|
||||||
|
raise TypeError(Errors.E969.format(field=key, types=types))
|
||||||
|
|
||||||
array = numpy.asarray(values, dtype="uint64")
|
array = numpy.asarray(values, dtype="uint64")
|
||||||
return attrs, array.T
|
return attrs, array.T
|
||||||
|
|
|
@ -540,19 +540,15 @@ class Language(object):
|
||||||
|
|
||||||
if component_cfg is None:
|
if component_cfg is None:
|
||||||
component_cfg = {}
|
component_cfg = {}
|
||||||
component_deps = count_pipeline_interdependencies(self.pipeline)
|
|
||||||
# Determine whether component should set annotations. In theory I guess
|
|
||||||
# we should do this by inspecting the meta? Or we could just always
|
|
||||||
# say "yes"
|
|
||||||
for i, (name, proc) in enumerate(self.pipeline):
|
for i, (name, proc) in enumerate(self.pipeline):
|
||||||
component_cfg.setdefault(name, {})
|
component_cfg.setdefault(name, {})
|
||||||
component_cfg[name].setdefault("drop", drop)
|
component_cfg[name].setdefault("drop", drop)
|
||||||
component_cfg[name]["set_annotations"] = bool(component_deps[i])
|
component_cfg[name].setdefault("set_annotations", False)
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if not hasattr(proc, "update"):
|
if not hasattr(proc, "update"):
|
||||||
continue
|
continue
|
||||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||||||
if sgd is not False:
|
if sgd not in (None, False):
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if hasattr(proc, "model"):
|
if hasattr(proc, "model"):
|
||||||
proc.model.finish_update(sgd)
|
proc.model.finish_update(sgd)
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
from thinc.api import Model, normal_init
|
from thinc.api import Model, normal_init
|
||||||
|
|
||||||
|
|
||||||
def PrecomputableAffine(nO, nI, nF, nP):
|
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
|
||||||
model = Model(
|
model = Model(
|
||||||
"precomputable_affine",
|
"precomputable_affine",
|
||||||
forward,
|
forward,
|
||||||
init=init,
|
init=init,
|
||||||
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
|
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
|
||||||
params={"W": None, "b": None, "pad": None},
|
params={"W": None, "b": None, "pad": None},
|
||||||
|
attrs={"dropout_rate": dropout}
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
@ -48,17 +49,14 @@ def forward(model, X, is_train):
|
||||||
model.inc_grad("b", dY.sum(axis=0))
|
model.inc_grad("b", dY.sum(axis=0))
|
||||||
dY = dY.reshape((dY.shape[0], nO * nP))
|
dY = dY.reshape((dY.shape[0], nO * nP))
|
||||||
|
|
||||||
Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
|
Wopfi = W.transpose((1, 2, 0, 3))
|
||||||
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
|
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
|
||||||
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
|
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
|
||||||
|
|
||||||
# Reuse the buffer
|
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
|
||||||
dWopfi = Wopfi
|
|
||||||
dWopfi.fill(0.0)
|
|
||||||
model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
|
||||||
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
|
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
|
||||||
# (o, p, f, i) --> (f, o, p, i)
|
# (o, p, f, i) --> (f, o, p, i)
|
||||||
dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
|
dWopfi = dWopfi.transpose((2, 0, 1, 3))
|
||||||
model.inc_grad("W", dWopfi)
|
model.inc_grad("W", dWopfi)
|
||||||
return dXf.reshape((dXf.shape[0], nF, nI))
|
return dXf.reshape((dXf.shape[0], nF, nI))
|
||||||
|
|
||||||
|
|
|
@ -263,20 +263,20 @@ def build_Tok2Vec_model(
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||||
norm = HashEmbed(
|
norm = HashEmbed(
|
||||||
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout,
|
nO=width, nV=embed_size, column=cols.index(NORM), dropout=None,
|
||||||
seed=0
|
seed=0
|
||||||
)
|
)
|
||||||
if subword_features:
|
if subword_features:
|
||||||
prefix = HashEmbed(
|
prefix = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None,
|
||||||
seed=1
|
seed=1
|
||||||
)
|
)
|
||||||
suffix = HashEmbed(
|
suffix = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None,
|
||||||
seed=2
|
seed=2
|
||||||
)
|
)
|
||||||
shape = HashEmbed(
|
shape = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None,
|
||||||
seed=3
|
seed=3
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
@ -296,7 +296,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -309,7 +309,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -322,7 +322,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -335,7 +335,7 @@ def build_Tok2Vec_model(
|
||||||
reduce_dimensions = Maxout(
|
reduce_dimensions = Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=nM * nC + width,
|
nI=nM * nC + width,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear
|
||||||
from ..syntax._parser_model import ParserStepModel
|
from ..syntax._parser_model import ParserStepModel
|
||||||
|
|
||||||
|
|
||||||
def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
|
def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
|
||||||
"""Set up a stepwise transition-based model"""
|
"""Set up a stepwise transition-based model"""
|
||||||
if upper is None:
|
if upper is None:
|
||||||
has_upper = False
|
has_upper = False
|
||||||
|
|
|
@ -272,7 +272,7 @@ cdef class Morphology:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def feats_to_dict(feats):
|
def feats_to_dict(feats):
|
||||||
if not feats:
|
if not feats or feats == Morphology.EMPTY_MORPH:
|
||||||
return {}
|
return {}
|
||||||
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
||||||
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
||||||
|
|
|
@ -3,7 +3,7 @@ cimport numpy as np
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import to_categorical
|
from thinc.api import SequenceCategoricalCrossentropy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
|
@ -85,13 +85,10 @@ class Morphologizer(Tagger):
|
||||||
doc.is_morphed = True
|
doc.is_morphed = True
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
tag_index = {tag: i for i, tag in enumerate(self.labels)}
|
truths = []
|
||||||
cdef int idx = 0
|
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
|
||||||
guesses = scores.argmax(axis=1)
|
|
||||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
|
eg_truths = []
|
||||||
pos_tags = eg.get_aligned("POS", as_string=True)
|
pos_tags = eg.get_aligned("POS", as_string=True)
|
||||||
morphs = eg.get_aligned("MORPH", as_string=True)
|
morphs = eg.get_aligned("MORPH", as_string=True)
|
||||||
for i in range(len(morphs)):
|
for i in range(len(morphs)):
|
||||||
|
@ -104,20 +101,11 @@ class Morphologizer(Tagger):
|
||||||
morph = self.vocab.strings[self.vocab.morphology.add(feats)]
|
morph = self.vocab.strings[self.vocab.morphology.add(feats)]
|
||||||
if morph == "":
|
if morph == "":
|
||||||
morph = Morphology.EMPTY_MORPH
|
morph = Morphology.EMPTY_MORPH
|
||||||
if morph is None:
|
eg_truths.append(morph)
|
||||||
correct[idx] = guesses[idx]
|
truths.append(eg_truths)
|
||||||
elif morph in tag_index:
|
d_scores, loss = loss_func(scores, truths)
|
||||||
correct[idx] = tag_index[morph]
|
if self.model.ops.xp.isnan(loss):
|
||||||
else:
|
raise ValueError("nan value when computing loss")
|
||||||
correct[idx] = 0
|
|
||||||
known_labels[idx] = 0.
|
|
||||||
idx += 1
|
|
||||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
|
||||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
|
||||||
d_scores *= self.model.ops.asarray(known_labels)
|
|
||||||
loss = (d_scores**2).sum()
|
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple()):
|
def to_bytes(self, exclude=tuple()):
|
||||||
|
|
|
@ -334,7 +334,7 @@ class Tagger(Pipe):
|
||||||
losses[self.name] += (gradient**2).sum()
|
losses[self.name] += (gradient**2).sum()
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
||||||
d_scores, loss = loss_func(scores, truths)
|
d_scores, loss = loss_func(scores, truths)
|
||||||
if self.model.ops.xp.isnan(loss):
|
if self.model.ops.xp.isnan(loss):
|
||||||
|
@ -521,29 +521,23 @@ class SentenceRecognizer(Tagger):
|
||||||
doc.c[j].sent_start = -1
|
doc.c[j].sent_start = -1
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
labels = self.labels
|
||||||
tag_index = range(len(self.labels))
|
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
||||||
cdef int idx = 0
|
truths = []
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
|
||||||
guesses = scores.argmax(axis=1)
|
|
||||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
sent_starts = eg.get_aligned("sent_start")
|
eg_truth = []
|
||||||
for sent_start in sent_starts:
|
for x in eg.get_aligned("sent_start"):
|
||||||
if sent_start is None:
|
if x == None:
|
||||||
correct[idx] = guesses[idx]
|
eg_truth.append(None)
|
||||||
elif sent_start in tag_index:
|
elif x == 1:
|
||||||
correct[idx] = sent_start
|
eg_truth.append(labels[1])
|
||||||
else:
|
else:
|
||||||
correct[idx] = 0
|
# anything other than 1: 0, -1, -1 as uint64
|
||||||
known_labels[idx] = 0.
|
eg_truth.append(labels[0])
|
||||||
idx += 1
|
truths.append(eg_truth)
|
||||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
d_scores, loss = loss_func(scores, truths)
|
||||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
if self.model.ops.xp.isnan(loss):
|
||||||
d_scores *= self.model.ops.asarray(known_labels)
|
raise ValueError("nan value when computing loss")
|
||||||
loss = (d_scores**2).sum()
|
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
||||||
|
|
|
@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
|
||||||
class ProjectConfigAsset(BaseModel):
|
class ProjectConfigAsset(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
||||||
url: StrictStr = Field(..., title="URL of asset")
|
url: Optional[StrictStr] = Field(None, title="URL of asset")
|
||||||
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
@ -246,7 +246,7 @@ class ProjectConfigSchema(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
|
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
|
||||||
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
|
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
|
||||||
run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
|
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
||||||
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
|
@ -326,10 +326,11 @@ class Scorer(object):
|
||||||
for token in doc:
|
for token in doc:
|
||||||
if token.orth_.isspace():
|
if token.orth_.isspace():
|
||||||
continue
|
continue
|
||||||
gold_i = align.cand_to_gold[token.i]
|
if align.x2y.lengths[token.i] != 1:
|
||||||
if gold_i is None:
|
|
||||||
self.tokens.fp += 1
|
self.tokens.fp += 1
|
||||||
|
gold_i = None
|
||||||
else:
|
else:
|
||||||
|
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||||
self.tokens.tp += 1
|
self.tokens.tp += 1
|
||||||
cand_tags.add((gold_i, token.tag_))
|
cand_tags.add((gold_i, token.tag_))
|
||||||
cand_pos.add((gold_i, token.pos_))
|
cand_pos.add((gold_i, token.pos_))
|
||||||
|
@ -345,7 +346,10 @@ class Scorer(object):
|
||||||
if token.is_sent_start:
|
if token.is_sent_start:
|
||||||
cand_sent_starts.add(gold_i)
|
cand_sent_starts.add(gold_i)
|
||||||
if token.dep_.lower() not in punct_labels and token.orth_.strip():
|
if token.dep_.lower() not in punct_labels and token.orth_.strip():
|
||||||
gold_head = align.cand_to_gold[token.head.i]
|
if align.x2y.lengths[token.head.i] == 1:
|
||||||
|
gold_head = align.x2y[token.head.i].dataXd[0, 0]
|
||||||
|
else:
|
||||||
|
gold_head = None
|
||||||
# None is indistinct, so we can't just add it to the set
|
# None is indistinct, so we can't just add it to the set
|
||||||
# Multiple (None, None) deps are possible
|
# Multiple (None, None) deps are possible
|
||||||
if gold_i is None or gold_head is None:
|
if gold_i is None or gold_head is None:
|
||||||
|
@ -381,15 +385,9 @@ class Scorer(object):
|
||||||
gold_ents.add(gold_ent)
|
gold_ents.add(gold_ent)
|
||||||
gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
|
gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
|
||||||
cand_per_ents = {ent_label: set() for ent_label in ent_labels}
|
cand_per_ents = {ent_label: set() for ent_label in ent_labels}
|
||||||
for ent in doc.ents:
|
for ent in example.get_aligned_spans_x2y(doc.ents):
|
||||||
first = align.cand_to_gold[ent.start]
|
cand_ents.add((ent.label_, ent.start, ent.end - 1))
|
||||||
last = align.cand_to_gold[ent.end - 1]
|
cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
|
||||||
if first is None or last is None:
|
|
||||||
self.ner.fp += 1
|
|
||||||
self.ner_per_ents[ent.label_].fp += 1
|
|
||||||
else:
|
|
||||||
cand_ents.add((ent.label_, first, last))
|
|
||||||
cand_per_ents[ent.label_].add((ent.label_, first, last))
|
|
||||||
# Scores per ent
|
# Scores per ent
|
||||||
for k, v in self.ner_per_ents.items():
|
for k, v in self.ner_per_ents.items():
|
||||||
if k in cand_per_ents:
|
if k in cand_per_ents:
|
||||||
|
|
|
@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
||||||
|
|
||||||
|
|
||||||
class ParserStepModel(Model):
|
class ParserStepModel(Model):
|
||||||
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
|
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
|
||||||
|
dropout=0.1):
|
||||||
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
||||||
self.attrs["has_upper"] = has_upper
|
self.attrs["has_upper"] = has_upper
|
||||||
|
self.attrs["dropout_rate"] = dropout
|
||||||
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
|
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
|
||||||
if layers[1].get_dim("nP") >= 2:
|
if layers[1].get_dim("nP") >= 2:
|
||||||
activation = "maxout"
|
activation = "maxout"
|
||||||
|
@ -289,11 +291,17 @@ class ParserStepModel(Model):
|
||||||
self.bp_tokvecs(d_tokvecs[:-1])
|
self.bp_tokvecs(d_tokvecs[:-1])
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
|
NUMPY_OPS = NumpyOps()
|
||||||
|
|
||||||
def step_forward(model: ParserStepModel, states, is_train):
|
def step_forward(model: ParserStepModel, states, is_train):
|
||||||
token_ids = model.get_token_ids(states)
|
token_ids = model.get_token_ids(states)
|
||||||
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
||||||
|
mask = None
|
||||||
if model.attrs["has_upper"]:
|
if model.attrs["has_upper"]:
|
||||||
|
dropout_rate = model.attrs["dropout_rate"]
|
||||||
|
if is_train and dropout_rate > 0:
|
||||||
|
mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
|
||||||
|
vector *= mask
|
||||||
scores, get_d_vector = model.vec2scores(vector, is_train)
|
scores, get_d_vector = model.vec2scores(vector, is_train)
|
||||||
else:
|
else:
|
||||||
scores = NumpyOps().asarray(vector)
|
scores = NumpyOps().asarray(vector)
|
||||||
|
@ -305,6 +313,8 @@ def step_forward(model: ParserStepModel, states, is_train):
|
||||||
# Zero vectors for unseen classes
|
# Zero vectors for unseen classes
|
||||||
d_scores *= model._class_mask
|
d_scores *= model._class_mask
|
||||||
d_vector = get_d_vector(d_scores)
|
d_vector = get_d_vector(d_scores)
|
||||||
|
if mask is not None:
|
||||||
|
d_vector *= mask
|
||||||
if isinstance(model.state2vec.ops, CupyOps) \
|
if isinstance(model.state2vec.ops, CupyOps) \
|
||||||
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to GPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
|
@ -437,7 +447,7 @@ cdef class precompute_hiddens:
|
||||||
sum_state_features(<float*>state_vector.data,
|
sum_state_features(<float*>state_vector.data,
|
||||||
feat_weights, &ids[0,0],
|
feat_weights, &ids[0,0],
|
||||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||||
state_vector = state_vector + self.bias
|
state_vector += self.bias
|
||||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||||
|
|
||||||
def backward(d_state_vector_ids):
|
def backward(d_state_vector_ids):
|
||||||
|
|
|
@ -65,7 +65,6 @@ cdef class Parser:
|
||||||
self.set_output(self.moves.n_moves)
|
self.set_output(self.moves.n_moves)
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
||||||
self.cfg.setdefault("normalize_gradients_with_batch_size", True)
|
|
||||||
self._multitasks = []
|
self._multitasks = []
|
||||||
for multitask in cfg.get("multitasks", []):
|
for multitask in cfg.get("multitasks", []):
|
||||||
self.add_multitask_objective(multitask)
|
self.add_multitask_objective(multitask)
|
||||||
|
@ -280,11 +279,12 @@ cdef class Parser:
|
||||||
[eg.predicted for eg in examples])
|
[eg.predicted for eg in examples])
|
||||||
if self.cfg["update_with_oracle_cut_size"] >= 1:
|
if self.cfg["update_with_oracle_cut_size"] >= 1:
|
||||||
# Chop sequences into lengths of this many transitions, to make the
|
# Chop sequences into lengths of this many transitions, to make the
|
||||||
# batch uniform length. We randomize this to overfit less.
|
# batch uniform length.
|
||||||
|
# We used to randomize this, but it's not clear that actually helps?
|
||||||
cut_size = self.cfg["update_with_oracle_cut_size"]
|
cut_size = self.cfg["update_with_oracle_cut_size"]
|
||||||
states, golds, max_steps = self._init_gold_batch(
|
states, golds, max_steps = self._init_gold_batch(
|
||||||
examples,
|
examples,
|
||||||
max_length=numpy.random.choice(range(5, cut_size))
|
max_length=cut_size
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||||
|
@ -292,24 +292,15 @@ cdef class Parser:
|
||||||
if not states:
|
if not states:
|
||||||
return losses
|
return losses
|
||||||
all_states = list(states)
|
all_states = list(states)
|
||||||
states_golds = zip(states, golds)
|
states_golds = list(zip(states, golds))
|
||||||
for _ in range(max_steps):
|
while states_golds:
|
||||||
if not states_golds:
|
|
||||||
break
|
|
||||||
states, golds = zip(*states_golds)
|
states, golds = zip(*states_golds)
|
||||||
scores, backprop = model.begin_update(states)
|
scores, backprop = model.begin_update(states)
|
||||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||||
if self.cfg["normalize_gradients_with_batch_size"]:
|
# Note that the gradient isn't normalized by the batch size
|
||||||
# We have to be very careful how we do this, because of the way we
|
# here, because our "samples" are really the states...But we
|
||||||
# cut up the batch. We subdivide long sequences. If we normalize
|
# can't normalize by the number of states either, as then we'd
|
||||||
# naively, we end up normalizing by sequence length, which
|
# be getting smaller gradients for states in long sequences.
|
||||||
# is bad: that would mean that states in long sequences
|
|
||||||
# consistently get smaller gradients. Imagine if we have two
|
|
||||||
# sequences, one length 1000, one length 20. If we cut up
|
|
||||||
# the 1k sequence so that we have a "batch" of 50 subsequences,
|
|
||||||
# we don't want the gradients to get 50 times smaller!
|
|
||||||
d_scores /= n_examples
|
|
||||||
|
|
||||||
backprop(d_scores)
|
backprop(d_scores)
|
||||||
# Follow the predicted action
|
# Follow the predicted action
|
||||||
self.transition_states(states, scores)
|
self.transition_states(states, scores)
|
||||||
|
@ -407,6 +398,7 @@ cdef class Parser:
|
||||||
cpu_log_loss(c_d_scores,
|
cpu_log_loss(c_d_scores,
|
||||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||||
c_d_scores += d_scores.shape[1]
|
c_d_scores += d_scores.shape[1]
|
||||||
|
# Note that we don't normalize this. See comment in update() for why.
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
losses[self.name] += (d_scores**2).sum()
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
@ -525,21 +517,25 @@ cdef class Parser:
|
||||||
StateClass state
|
StateClass state
|
||||||
Transition action
|
Transition action
|
||||||
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
||||||
|
states = []
|
||||||
|
golds = []
|
||||||
kept = []
|
kept = []
|
||||||
max_length_seen = 0
|
max_length_seen = 0
|
||||||
for state, eg in zip(all_states, examples):
|
for state, eg in zip(all_states, examples):
|
||||||
if self.moves.has_gold(eg) and not state.is_final():
|
if self.moves.has_gold(eg) and not state.is_final():
|
||||||
gold = self.moves.init_gold(state, eg)
|
gold = self.moves.init_gold(state, eg)
|
||||||
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
if len(eg.x) < max_length:
|
||||||
state.copy(), gold)
|
states.append(state)
|
||||||
kept.append((eg, state, gold, oracle_actions))
|
golds.append(gold)
|
||||||
min_length = min(min_length, len(oracle_actions))
|
else:
|
||||||
max_length_seen = max(max_length, len(oracle_actions))
|
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
||||||
|
state.copy(), gold)
|
||||||
|
kept.append((eg, state, gold, oracle_actions))
|
||||||
|
min_length = min(min_length, len(oracle_actions))
|
||||||
|
max_length_seen = max(max_length, len(oracle_actions))
|
||||||
if not kept:
|
if not kept:
|
||||||
return [], [], 0
|
return states, golds, 0
|
||||||
max_length = max(min_length, min(max_length, max_length_seen))
|
max_length = max(min_length, min(max_length, max_length_seen))
|
||||||
states = []
|
|
||||||
golds = []
|
|
||||||
cdef int clas
|
cdef int clas
|
||||||
max_moves = 0
|
max_moves = 0
|
||||||
for eg, state, gold, oracle_actions in kept:
|
for eg, state, gold, oracle_actions in kept:
|
||||||
|
|
|
@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
|
|
||||||
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
assert contains_cycle(tree) is None
|
assert contains_cycle(tree) is None
|
||||||
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
|
assert contains_cycle(cyclic_tree) == {3, 4, 5}
|
||||||
assert contains_cycle(partial_tree) is None
|
assert contains_cycle(partial_tree) is None
|
||||||
assert contains_cycle(multirooted_tree) is None
|
assert contains_cycle(multirooted_tree) is None
|
||||||
|
|
||||||
|
|
|
@ -38,6 +38,11 @@ def test_overfitting_IO():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
|
# add some cases where SENT_START == -1
|
||||||
|
train_examples[0].reference[10].is_sent_start = False
|
||||||
|
train_examples[1].reference[1].is_sent_start = False
|
||||||
|
train_examples[1].reference[11].is_sent_start = False
|
||||||
|
|
||||||
nlp.add_pipe(senter)
|
nlp.add_pipe(senter)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ def test_issue2070():
|
||||||
assert len(doc) == 11
|
assert len(doc) == 11
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2179():
|
def test_issue2179():
|
||||||
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
|
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
|
||||||
nlp = Italian()
|
nlp = Italian()
|
||||||
|
@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
|
||||||
assert len(matches) == 3
|
assert len(matches) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2482():
|
def test_issue2482():
|
||||||
"""Test we can serialize and deserialize a blank NER or parser model."""
|
"""Test we can serialize and deserialize a blank NER or parser model."""
|
||||||
nlp = Italian()
|
nlp = Italian()
|
||||||
|
|
|
@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
|
||||||
assert doc[0].like_num
|
assert doc[0].like_num
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2800():
|
def test_issue2800():
|
||||||
"""Test issue that arises when too many labels are added to NER model.
|
"""Test issue that arises when too many labels are added to NER model.
|
||||||
Used to cause segfault.
|
Used to cause segfault.
|
||||||
"""
|
"""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
train_data = []
|
train_data = []
|
||||||
train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
|
train_data.extend(
|
||||||
|
[Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
|
||||||
|
)
|
||||||
entity_types = [str(i) for i in range(1000)]
|
entity_types = [str(i) for i in range(1000)]
|
||||||
ner = nlp.create_pipe("ner")
|
ner = nlp.create_pipe("ner")
|
||||||
nlp.add_pipe(ner)
|
nlp.add_pipe(ner)
|
||||||
|
|
|
@ -88,6 +88,7 @@ def test_issue3199():
|
||||||
assert list(doc[0:3].noun_chunks) == []
|
assert list(doc[0:3].noun_chunks) == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue3209():
|
def test_issue3209():
|
||||||
"""Test issue that occurred in spaCy nightly where NER labels were being
|
"""Test issue that occurred in spaCy nightly where NER labels were being
|
||||||
mapped to classes incorrectly after loading the model, when the labels
|
mapped to classes incorrectly after loading the model, when the labels
|
||||||
|
|
472
spacy/tests/regression/test_issue3501-4000.py
Normal file
472
spacy/tests/regression/test_issue3501-4000.py
Normal file
|
@ -0,0 +1,472 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.pipeline import EntityRuler, DependencyParser
|
||||||
|
from spacy.pipeline.defaults import default_parser
|
||||||
|
from spacy import displacy, load
|
||||||
|
from spacy.displacy import parse_deps
|
||||||
|
from spacy.tokens import Doc, Token
|
||||||
|
from spacy.matcher import Matcher, PhraseMatcher
|
||||||
|
from spacy.errors import MatchPatternError
|
||||||
|
from spacy.util import minibatch
|
||||||
|
from spacy.gold import Example
|
||||||
|
from spacy.lang.hi import Hindi
|
||||||
|
from spacy.lang.es import Spanish
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.attrs import IS_ALPHA
|
||||||
|
from thinc.api import compounding
|
||||||
|
import spacy
|
||||||
|
import srsly
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
from ..util import make_tempdir, get_doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
||||||
|
def test_issue3521(en_tokenizer, word):
|
||||||
|
tok = en_tokenizer(word)[1]
|
||||||
|
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||||
|
assert tok.is_stop
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_1(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
ruler_bytes = ruler.to_bytes()
|
||||||
|
assert len(ruler) == len(patterns)
|
||||||
|
assert len(ruler.labels) == 4
|
||||||
|
assert ruler.overwrite
|
||||||
|
new_ruler = EntityRuler(nlp)
|
||||||
|
new_ruler = new_ruler.from_bytes(ruler_bytes)
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
assert len(new_ruler.labels) == 4
|
||||||
|
assert new_ruler.overwrite == ruler.overwrite
|
||||||
|
assert new_ruler.ent_id_sep == ruler.ent_id_sep
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_2(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
|
||||||
|
new_ruler = EntityRuler(nlp)
|
||||||
|
new_ruler = new_ruler.from_bytes(bytes_old_style)
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
for pattern in ruler.patterns:
|
||||||
|
assert pattern in new_ruler.patterns
|
||||||
|
assert new_ruler.overwrite is not ruler.overwrite
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_3(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
out_file = tmpdir / "entity_ruler"
|
||||||
|
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
|
||||||
|
new_ruler = EntityRuler(nlp).from_disk(out_file)
|
||||||
|
for pattern in ruler.patterns:
|
||||||
|
assert pattern in new_ruler.patterns
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
assert new_ruler.overwrite is not ruler.overwrite
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue_3526_4(en_vocab):
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, overwrite_ents=True)
|
||||||
|
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
nlp.to_disk(tmpdir)
|
||||||
|
ruler = nlp.get_pipe("entity_ruler")
|
||||||
|
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||||
|
assert ruler.overwrite is True
|
||||||
|
nlp2 = load(tmpdir)
|
||||||
|
new_ruler = nlp2.get_pipe("entity_ruler")
|
||||||
|
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||||
|
assert new_ruler.overwrite is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3531():
|
||||||
|
"""Test that displaCy renderer doesn't require "settings" key."""
|
||||||
|
example_dep = {
|
||||||
|
"words": [
|
||||||
|
{"text": "But", "tag": "CCONJ"},
|
||||||
|
{"text": "Google", "tag": "PROPN"},
|
||||||
|
{"text": "is", "tag": "VERB"},
|
||||||
|
{"text": "starting", "tag": "VERB"},
|
||||||
|
{"text": "from", "tag": "ADP"},
|
||||||
|
{"text": "behind.", "tag": "ADV"},
|
||||||
|
],
|
||||||
|
"arcs": [
|
||||||
|
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
|
||||||
|
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
|
||||||
|
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
|
||||||
|
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
|
||||||
|
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
example_ent = {
|
||||||
|
"text": "But Google is starting from behind.",
|
||||||
|
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
||||||
|
}
|
||||||
|
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
||||||
|
assert dep_html
|
||||||
|
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
||||||
|
assert ent_html
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3540(en_vocab):
|
||||||
|
words = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
tensor = numpy.asarray(
|
||||||
|
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
||||||
|
dtype="f",
|
||||||
|
)
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
doc.tensor = tensor
|
||||||
|
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
assert [token.text for token in doc] == gold_text
|
||||||
|
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
assert [token.lemma_ for token in doc] == gold_lemma
|
||||||
|
vectors_1 = [token.vector for token in doc]
|
||||||
|
assert len(vectors_1) == len(doc)
|
||||||
|
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
|
heads = [(doc[3], 1), doc[2]]
|
||||||
|
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||||
|
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||||
|
|
||||||
|
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||||
|
assert [token.text for token in doc] == gold_text
|
||||||
|
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
||||||
|
assert [token.lemma_ for token in doc] == gold_lemma
|
||||||
|
vectors_2 = [token.vector for token in doc]
|
||||||
|
assert len(vectors_2) == len(doc)
|
||||||
|
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
||||||
|
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
||||||
|
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
||||||
|
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
||||||
|
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3549(en_vocab):
|
||||||
|
"""Test that match pattern validation doesn't raise on empty errors."""
|
||||||
|
matcher = Matcher(en_vocab, validate=True)
|
||||||
|
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||||
|
matcher.add("GOOD", [pattern])
|
||||||
|
with pytest.raises(MatchPatternError):
|
||||||
|
matcher.add("BAD", [[{"X": "Y"}]])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_issue3555(en_vocab):
|
||||||
|
"""Test that custom extensions with default None don't break matcher."""
|
||||||
|
Token.set_extension("issue3555", default=None)
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["have", "apple"])
|
||||||
|
matcher(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3611():
|
||||||
|
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
||||||
|
unique_classes = ["offensive", "inoffensive"]
|
||||||
|
x_train = [
|
||||||
|
"This is an offensive text",
|
||||||
|
"This is the second offensive text",
|
||||||
|
"inoff",
|
||||||
|
]
|
||||||
|
y_train = ["offensive", "offensive", "inoffensive"]
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
# preparing the data
|
||||||
|
train_data = []
|
||||||
|
for text, train_instance in zip(x_train, y_train):
|
||||||
|
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||||
|
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||||
|
# add a text categorizer component
|
||||||
|
textcat = nlp.create_pipe(
|
||||||
|
"textcat",
|
||||||
|
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||||
|
)
|
||||||
|
for label in unique_classes:
|
||||||
|
textcat.add_label(label)
|
||||||
|
nlp.add_pipe(textcat, last=True)
|
||||||
|
# training the network
|
||||||
|
with nlp.select_pipes(enable="textcat"):
|
||||||
|
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
||||||
|
for i in range(3):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(
|
||||||
|
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3625():
|
||||||
|
"""Test that default punctuation rules applies to hindi unicode characters"""
|
||||||
|
nlp = Hindi()
|
||||||
|
doc = nlp("hi. how हुए. होटल, होटल")
|
||||||
|
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
||||||
|
assert [token.text for token in doc] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3803():
|
||||||
|
"""Test that spanish num-like tokens have True for like_num attribute."""
|
||||||
|
nlp = Spanish()
|
||||||
|
text = "2 dos 1000 mil 12 doce"
|
||||||
|
doc = nlp(text)
|
||||||
|
|
||||||
|
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3830_no_subtok():
|
||||||
|
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||||
|
config = {
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||||
|
parser.add_label("nsubj")
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
parser.begin_training(lambda: [])
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3830_with_subtok():
|
||||||
|
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||||
|
config = {
|
||||||
|
"learn_tokens": True,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||||
|
parser.add_label("nsubj")
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
parser.begin_training(lambda: [])
|
||||||
|
assert "subtok" in parser.labels
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3839(en_vocab):
|
||||||
|
"""Test that match IDs returned by the matcher are correct, are in the string """
|
||||||
|
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
match_id = "PATTERN"
|
||||||
|
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||||
|
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||||
|
matcher.add(match_id, [pattern1])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches[0][0] == en_vocab.strings[match_id]
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add(match_id, [pattern2])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches[0][0] == en_vocab.strings[match_id]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"sentence",
|
||||||
|
[
|
||||||
|
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
||||||
|
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
||||||
|
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
||||||
|
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
||||||
|
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_issue3869(sentence):
|
||||||
|
"""Test that the Doc's count_by function works consistently"""
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(sentence)
|
||||||
|
count = 0
|
||||||
|
for token in doc:
|
||||||
|
count += token.is_alpha
|
||||||
|
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3879(en_vocab):
|
||||||
|
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||||
|
assert len(doc) == 5
|
||||||
|
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3880():
|
||||||
|
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
||||||
|
|
||||||
|
Fixed in v7.0.5 of Thinc.
|
||||||
|
"""
|
||||||
|
texts = ["hello", "world", "", ""]
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
|
nlp.add_pipe(nlp.create_pipe("ner"))
|
||||||
|
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||||
|
nlp.get_pipe("parser").add_label("dep")
|
||||||
|
nlp.get_pipe("ner").add_label("PERSON")
|
||||||
|
nlp.get_pipe("tagger").add_label("NN")
|
||||||
|
nlp.begin_training()
|
||||||
|
for doc in nlp.pipe(texts):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3882(en_vocab):
|
||||||
|
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
||||||
|
copy of the Doc.
|
||||||
|
"""
|
||||||
|
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||||
|
doc.is_parsed = True
|
||||||
|
doc.user_data["test"] = set()
|
||||||
|
parse_deps(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3951(en_vocab):
|
||||||
|
"""Test that combinations of optional rules are matched correctly."""
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [
|
||||||
|
{"LOWER": "hello"},
|
||||||
|
{"LOWER": "this", "OP": "?"},
|
||||||
|
{"OP": "?"},
|
||||||
|
{"LOWER": "world"},
|
||||||
|
]
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3959():
|
||||||
|
""" Ensure that a modified pos attribute is serialized correctly."""
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(
|
||||||
|
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
||||||
|
)
|
||||||
|
assert doc[0].pos_ == ""
|
||||||
|
doc[0].pos_ = "NOUN"
|
||||||
|
assert doc[0].pos_ == "NOUN"
|
||||||
|
# usually this is already True when starting from proper models instead of blank English
|
||||||
|
doc.is_tagged = True
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
file_path = tmp_dir / "my_doc"
|
||||||
|
doc.to_disk(file_path)
|
||||||
|
doc2 = nlp("")
|
||||||
|
doc2.from_disk(file_path)
|
||||||
|
assert doc2[0].pos_ == "NOUN"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3962(en_vocab):
|
||||||
|
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
|
# fmt: off
|
||||||
|
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
||||||
|
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
||||||
|
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
|
span2 = doc[1:5] # "jests at scars ,"
|
||||||
|
doc2 = span2.as_doc()
|
||||||
|
doc2_json = doc2.to_json()
|
||||||
|
assert doc2_json
|
||||||
|
# head set to itself, being the new artificial root
|
||||||
|
assert doc2[0].head.text == "jests"
|
||||||
|
assert doc2[0].dep_ == "dep"
|
||||||
|
assert doc2[1].head.text == "jests"
|
||||||
|
assert doc2[1].dep_ == "prep"
|
||||||
|
assert doc2[2].head.text == "at"
|
||||||
|
assert doc2[2].dep_ == "pobj"
|
||||||
|
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
||||||
|
assert doc2[3].dep_ == "dep"
|
||||||
|
# We should still have 1 sentence
|
||||||
|
assert len(list(doc2.sents)) == 1
|
||||||
|
span3 = doc[6:9] # "never felt a"
|
||||||
|
doc3 = span3.as_doc()
|
||||||
|
doc3_json = doc3.to_json()
|
||||||
|
assert doc3_json
|
||||||
|
assert doc3[0].head.text == "felt"
|
||||||
|
assert doc3[0].dep_ == "neg"
|
||||||
|
assert doc3[1].head.text == "felt"
|
||||||
|
assert doc3[1].dep_ == "ROOT"
|
||||||
|
assert doc3[2].head.text == "felt" # head set to ancestor
|
||||||
|
assert doc3[2].dep_ == "dep"
|
||||||
|
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
||||||
|
assert len(list(doc3.sents)) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3962_long(en_vocab):
|
||||||
|
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
|
# fmt: off
|
||||||
|
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
||||||
|
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
||||||
|
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
|
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
||||||
|
doc2 = span2.as_doc()
|
||||||
|
doc2_json = doc2.to_json()
|
||||||
|
assert doc2_json
|
||||||
|
# head set to itself, being the new artificial root (in sentence 1)
|
||||||
|
assert doc2[0].head.text == "jests"
|
||||||
|
assert doc2[0].dep_ == "ROOT"
|
||||||
|
assert doc2[1].head.text == "jests"
|
||||||
|
assert doc2[1].dep_ == "prep"
|
||||||
|
assert doc2[2].head.text == "at"
|
||||||
|
assert doc2[2].dep_ == "pobj"
|
||||||
|
assert doc2[3].head.text == "jests"
|
||||||
|
assert doc2[3].dep_ == "punct"
|
||||||
|
# head set to itself, being the new artificial root (in sentence 2)
|
||||||
|
assert doc2[4].head.text == "They"
|
||||||
|
assert doc2[4].dep_ == "dep"
|
||||||
|
# head set to the new artificial head (in sentence 2)
|
||||||
|
assert doc2[4].head.text == "They"
|
||||||
|
assert doc2[4].dep_ == "dep"
|
||||||
|
# We should still have 2 sentences
|
||||||
|
sents = list(doc2.sents)
|
||||||
|
assert len(sents) == 2
|
||||||
|
assert sents[0].text == "jests at scars ."
|
||||||
|
assert sents[1].text == "They never"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3972(en_vocab):
|
||||||
|
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||||
|
"""
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||||
|
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||||
|
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
|
||||||
|
assert len(matches) == 2
|
||||||
|
|
||||||
|
# We should have a match for each of the two rules
|
||||||
|
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
||||||
|
assert "A" in found_ids
|
||||||
|
assert "B" in found_ids
|
|
@ -1,8 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
|
||||||
def test_issue3521(en_tokenizer, word):
|
|
||||||
tok = en_tokenizer(word)[1]
|
|
||||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
|
||||||
assert tok.is_stop
|
|
|
@ -1,85 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.tokens import Span
|
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
from spacy import load
|
|
||||||
import srsly
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def patterns():
|
|
||||||
return [
|
|
||||||
{"label": "HELLO", "pattern": "hello world"},
|
|
||||||
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
|
||||||
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
|
||||||
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
|
||||||
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def add_ent():
|
|
||||||
def add_ent_component(doc):
|
|
||||||
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
|
|
||||||
return doc
|
|
||||||
|
|
||||||
return add_ent_component
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
ruler_bytes = ruler.to_bytes()
|
|
||||||
assert len(ruler) == len(patterns)
|
|
||||||
assert len(ruler.labels) == 4
|
|
||||||
assert ruler.overwrite
|
|
||||||
new_ruler = EntityRuler(nlp)
|
|
||||||
new_ruler = new_ruler.from_bytes(ruler_bytes)
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
assert len(new_ruler.labels) == 4
|
|
||||||
assert new_ruler.overwrite == ruler.overwrite
|
|
||||||
assert new_ruler.ent_id_sep == ruler.ent_id_sep
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
|
|
||||||
new_ruler = EntityRuler(nlp)
|
|
||||||
new_ruler = new_ruler.from_bytes(bytes_old_style)
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
for pattern in ruler.patterns:
|
|
||||||
assert pattern in new_ruler.patterns
|
|
||||||
assert new_ruler.overwrite is not ruler.overwrite
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
out_file = tmpdir / "entity_ruler"
|
|
||||||
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
|
|
||||||
new_ruler = EntityRuler(nlp).from_disk(out_file)
|
|
||||||
for pattern in ruler.patterns:
|
|
||||||
assert pattern in new_ruler.patterns
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
assert new_ruler.overwrite is not ruler.overwrite
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, overwrite_ents=True)
|
|
||||||
|
|
||||||
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
nlp.to_disk(tmpdir)
|
|
||||||
ruler = nlp.get_pipe("entity_ruler")
|
|
||||||
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
|
||||||
assert ruler.overwrite is True
|
|
||||||
nlp2 = load(tmpdir)
|
|
||||||
new_ruler = nlp2.get_pipe("entity_ruler")
|
|
||||||
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
|
||||||
assert new_ruler.overwrite is True
|
|
|
@ -1,30 +0,0 @@
|
||||||
from spacy import displacy
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3531():
|
|
||||||
"""Test that displaCy renderer doesn't require "settings" key."""
|
|
||||||
example_dep = {
|
|
||||||
"words": [
|
|
||||||
{"text": "But", "tag": "CCONJ"},
|
|
||||||
{"text": "Google", "tag": "PROPN"},
|
|
||||||
{"text": "is", "tag": "VERB"},
|
|
||||||
{"text": "starting", "tag": "VERB"},
|
|
||||||
{"text": "from", "tag": "ADP"},
|
|
||||||
{"text": "behind.", "tag": "ADV"},
|
|
||||||
],
|
|
||||||
"arcs": [
|
|
||||||
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
|
|
||||||
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
|
|
||||||
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
|
|
||||||
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
|
|
||||||
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
example_ent = {
|
|
||||||
"text": "But Google is starting from behind.",
|
|
||||||
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
|
||||||
}
|
|
||||||
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
|
||||||
assert dep_html
|
|
||||||
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
|
||||||
assert ent_html
|
|
|
@ -1,44 +0,0 @@
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3540(en_vocab):
|
|
||||||
|
|
||||||
words = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
tensor = np.asarray(
|
|
||||||
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
|
||||||
dtype="f",
|
|
||||||
)
|
|
||||||
doc = Doc(en_vocab, words=words)
|
|
||||||
doc.tensor = tensor
|
|
||||||
|
|
||||||
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
assert [token.text for token in doc] == gold_text
|
|
||||||
|
|
||||||
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
assert [token.lemma_ for token in doc] == gold_lemma
|
|
||||||
|
|
||||||
vectors_1 = [token.vector for token in doc]
|
|
||||||
assert len(vectors_1) == len(doc)
|
|
||||||
|
|
||||||
with doc.retokenize() as retokenizer:
|
|
||||||
heads = [(doc[3], 1), doc[2]]
|
|
||||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
|
||||||
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
|
||||||
|
|
||||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
|
||||||
assert [token.text for token in doc] == gold_text
|
|
||||||
|
|
||||||
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
|
||||||
assert [token.lemma_ for token in doc] == gold_lemma
|
|
||||||
|
|
||||||
vectors_2 = [token.vector for token in doc]
|
|
||||||
assert len(vectors_2) == len(doc)
|
|
||||||
|
|
||||||
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
|
||||||
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
|
||||||
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
|
||||||
|
|
||||||
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
|
||||||
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
|
|
@ -1,12 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.errors import MatchPatternError
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3549(en_vocab):
|
|
||||||
"""Test that match pattern validation doesn't raise on empty errors."""
|
|
||||||
matcher = Matcher(en_vocab, validate=True)
|
|
||||||
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
|
||||||
matcher.add("GOOD", [pattern])
|
|
||||||
with pytest.raises(MatchPatternError):
|
|
||||||
matcher.add("BAD", [[{"X": "Y"}]])
|
|
|
@ -1,14 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.tokens import Doc, Token
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_issue3555(en_vocab):
|
|
||||||
"""Test that custom extensions with default None don't break matcher."""
|
|
||||||
Token.set_extension("issue3555", default=None)
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
doc = Doc(en_vocab, words=["have", "apple"])
|
|
||||||
matcher(doc)
|
|
|
@ -1,45 +0,0 @@
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch
|
|
||||||
from thinc.api import compounding
|
|
||||||
from spacy.gold import Example
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3611():
|
|
||||||
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
|
||||||
unique_classes = ["offensive", "inoffensive"]
|
|
||||||
x_train = [
|
|
||||||
"This is an offensive text",
|
|
||||||
"This is the second offensive text",
|
|
||||||
"inoff",
|
|
||||||
]
|
|
||||||
y_train = ["offensive", "offensive", "inoffensive"]
|
|
||||||
|
|
||||||
nlp = spacy.blank("en")
|
|
||||||
|
|
||||||
# preparing the data
|
|
||||||
train_data = []
|
|
||||||
for text, train_instance in zip(x_train, y_train):
|
|
||||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
|
||||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
|
||||||
|
|
||||||
# add a text categorizer component
|
|
||||||
textcat = nlp.create_pipe(
|
|
||||||
"textcat",
|
|
||||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
|
||||||
)
|
|
||||||
|
|
||||||
for label in unique_classes:
|
|
||||||
textcat.add_label(label)
|
|
||||||
nlp.add_pipe(textcat, last=True)
|
|
||||||
|
|
||||||
# training the network
|
|
||||||
with nlp.select_pipes(enable="textcat"):
|
|
||||||
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
|
||||||
for i in range(3):
|
|
||||||
losses = {}
|
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
|
||||||
|
|
||||||
for batch in batches:
|
|
||||||
nlp.update(
|
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
|
@ -1,9 +0,0 @@
|
||||||
from spacy.lang.hi import Hindi
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3625():
|
|
||||||
"""Test that default punctuation rules applies to hindi unicode characters"""
|
|
||||||
nlp = Hindi()
|
|
||||||
doc = nlp("hi. how हुए. होटल, होटल")
|
|
||||||
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
|
||||||
assert [token.text for token in doc] == expected
|
|
|
@ -1,10 +0,0 @@
|
||||||
from spacy.lang.es import Spanish
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3803():
|
|
||||||
"""Test that spanish num-like tokens have True for like_num attribute."""
|
|
||||||
nlp = Spanish()
|
|
||||||
text = "2 dos 1000 mil 12 doce"
|
|
||||||
doc = nlp(text)
|
|
||||||
|
|
||||||
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
|
|
@ -1,34 +0,0 @@
|
||||||
from spacy.pipeline.pipes import DependencyParser
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
from spacy.pipeline.defaults import default_parser
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3830_no_subtok():
|
|
||||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
|
||||||
config = {
|
|
||||||
"learn_tokens": False,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
|
||||||
parser.add_label("nsubj")
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
parser.begin_training(lambda: [])
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3830_with_subtok():
|
|
||||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
|
||||||
config = {
|
|
||||||
"learn_tokens": True,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
|
||||||
parser.add_label("nsubj")
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
parser.begin_training(lambda: [])
|
|
||||||
assert "subtok" in parser.labels
|
|
|
@ -1,18 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3839(en_vocab):
|
|
||||||
"""Test that match IDs returned by the matcher are correct, are in the string """
|
|
||||||
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
match_id = "PATTERN"
|
|
||||||
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
|
||||||
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
|
||||||
matcher.add(match_id, [pattern1])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert matches[0][0] == en_vocab.strings[match_id]
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add(match_id, [pattern2])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert matches[0][0] == en_vocab.strings[match_id]
|
|
|
@ -1,25 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.attrs import IS_ALPHA
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"sentence",
|
|
||||||
[
|
|
||||||
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
|
||||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
|
||||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
|
||||||
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
|
||||||
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_issue3869(sentence):
|
|
||||||
"""Test that the Doc's count_by function works consistently"""
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(sentence)
|
|
||||||
|
|
||||||
count = 0
|
|
||||||
for token in doc:
|
|
||||||
count += token.is_alpha
|
|
||||||
|
|
||||||
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
|
|
@ -1,11 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3879(en_vocab):
|
|
||||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
|
||||||
assert len(doc) == 5
|
|
||||||
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
|
|
@ -1,21 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
|
||||||
def test_issue3880():
|
|
||||||
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
|
||||||
|
|
||||||
Fixed in v7.0.5 of Thinc.
|
|
||||||
"""
|
|
||||||
texts = ["hello", "world", "", ""]
|
|
||||||
nlp = English()
|
|
||||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
|
||||||
nlp.add_pipe(nlp.create_pipe("ner"))
|
|
||||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
|
||||||
nlp.get_pipe("parser").add_label("dep")
|
|
||||||
nlp.get_pipe("ner").add_label("PERSON")
|
|
||||||
nlp.get_pipe("tagger").add_label("NN")
|
|
||||||
nlp.begin_training()
|
|
||||||
for doc in nlp.pipe(texts):
|
|
||||||
pass
|
|
|
@ -1,12 +0,0 @@
|
||||||
from spacy.displacy import parse_deps
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3882(en_vocab):
|
|
||||||
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
|
||||||
copy of the Doc.
|
|
||||||
"""
|
|
||||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
|
||||||
doc.is_parsed = True
|
|
||||||
doc.user_data["test"] = set()
|
|
||||||
parse_deps(doc)
|
|
|
@ -1,17 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3951(en_vocab):
|
|
||||||
"""Test that combinations of optional rules are matched correctly."""
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
pattern = [
|
|
||||||
{"LOWER": "hello"},
|
|
||||||
{"LOWER": "this", "OP": "?"},
|
|
||||||
{"OP": "?"},
|
|
||||||
{"LOWER": "world"},
|
|
||||||
]
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert len(matches) == 0
|
|
|
@ -1,26 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3959():
|
|
||||||
""" Ensure that a modified pos attribute is serialized correctly."""
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(
|
|
||||||
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
|
||||||
)
|
|
||||||
assert doc[0].pos_ == ""
|
|
||||||
|
|
||||||
doc[0].pos_ = "NOUN"
|
|
||||||
assert doc[0].pos_ == "NOUN"
|
|
||||||
|
|
||||||
# usually this is already True when starting from proper models instead of blank English
|
|
||||||
doc.is_tagged = True
|
|
||||||
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
file_path = tmp_dir / "my_doc"
|
|
||||||
doc.to_disk(file_path)
|
|
||||||
|
|
||||||
doc2 = nlp("")
|
|
||||||
doc2.from_disk(file_path)
|
|
||||||
|
|
||||||
assert doc2[0].pos_ == "NOUN"
|
|
|
@ -1,117 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from ..util import get_doc
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def doc(en_tokenizer):
|
|
||||||
text = "He jests at scars, that never felt a wound."
|
|
||||||
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
|
||||||
deps = [
|
|
||||||
"nsubj",
|
|
||||||
"ccomp",
|
|
||||||
"prep",
|
|
||||||
"pobj",
|
|
||||||
"punct",
|
|
||||||
"nsubj",
|
|
||||||
"neg",
|
|
||||||
"ROOT",
|
|
||||||
"det",
|
|
||||||
"dobj",
|
|
||||||
"punct",
|
|
||||||
]
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962(doc):
|
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
|
||||||
span2 = doc[1:5] # "jests at scars ,"
|
|
||||||
doc2 = span2.as_doc()
|
|
||||||
doc2_json = doc2.to_json()
|
|
||||||
assert doc2_json
|
|
||||||
|
|
||||||
assert (
|
|
||||||
doc2[0].head.text == "jests"
|
|
||||||
) # head set to itself, being the new artificial root
|
|
||||||
assert doc2[0].dep_ == "dep"
|
|
||||||
assert doc2[1].head.text == "jests"
|
|
||||||
assert doc2[1].dep_ == "prep"
|
|
||||||
assert doc2[2].head.text == "at"
|
|
||||||
assert doc2[2].dep_ == "pobj"
|
|
||||||
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
|
||||||
assert doc2[3].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 1 sentence
|
|
||||||
assert len(list(doc2.sents)) == 1
|
|
||||||
|
|
||||||
span3 = doc[6:9] # "never felt a"
|
|
||||||
doc3 = span3.as_doc()
|
|
||||||
doc3_json = doc3.to_json()
|
|
||||||
assert doc3_json
|
|
||||||
|
|
||||||
assert doc3[0].head.text == "felt"
|
|
||||||
assert doc3[0].dep_ == "neg"
|
|
||||||
assert doc3[1].head.text == "felt"
|
|
||||||
assert doc3[1].dep_ == "ROOT"
|
|
||||||
assert doc3[2].head.text == "felt" # head set to ancestor
|
|
||||||
assert doc3[2].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
|
||||||
assert len(list(doc3.sents)) == 1
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def two_sent_doc(en_tokenizer):
|
|
||||||
text = "He jests at scars. They never felt a wound."
|
|
||||||
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
|
||||||
deps = [
|
|
||||||
"nsubj",
|
|
||||||
"ROOT",
|
|
||||||
"prep",
|
|
||||||
"pobj",
|
|
||||||
"punct",
|
|
||||||
"nsubj",
|
|
||||||
"neg",
|
|
||||||
"ROOT",
|
|
||||||
"det",
|
|
||||||
"dobj",
|
|
||||||
"punct",
|
|
||||||
]
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962_long(two_sent_doc):
|
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
|
||||||
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
|
||||||
doc2 = span2.as_doc()
|
|
||||||
doc2_json = doc2.to_json()
|
|
||||||
assert doc2_json
|
|
||||||
|
|
||||||
assert (
|
|
||||||
doc2[0].head.text == "jests"
|
|
||||||
) # head set to itself, being the new artificial root (in sentence 1)
|
|
||||||
assert doc2[0].dep_ == "ROOT"
|
|
||||||
assert doc2[1].head.text == "jests"
|
|
||||||
assert doc2[1].dep_ == "prep"
|
|
||||||
assert doc2[2].head.text == "at"
|
|
||||||
assert doc2[2].dep_ == "pobj"
|
|
||||||
assert doc2[3].head.text == "jests"
|
|
||||||
assert doc2[3].dep_ == "punct"
|
|
||||||
assert (
|
|
||||||
doc2[4].head.text == "They"
|
|
||||||
) # head set to itself, being the new artificial root (in sentence 2)
|
|
||||||
assert doc2[4].dep_ == "dep"
|
|
||||||
assert (
|
|
||||||
doc2[4].head.text == "They"
|
|
||||||
) # head set to the new artificial head (in sentence 2)
|
|
||||||
assert doc2[4].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 2 sentences
|
|
||||||
sents = list(doc2.sents)
|
|
||||||
assert len(sents) == 2
|
|
||||||
assert sents[0].text == "jests at scars ."
|
|
||||||
assert sents[1].text == "They never"
|
|
|
@ -1,19 +0,0 @@
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3972(en_vocab):
|
|
||||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab)
|
|
||||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
|
||||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
|
||||||
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
|
||||||
matches = matcher(doc)
|
|
||||||
|
|
||||||
assert len(matches) == 2
|
|
||||||
|
|
||||||
# We should have a match for each of the two rules
|
|
||||||
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
|
||||||
assert "A" in found_ids
|
|
||||||
assert "B" in found_ids
|
|
469
spacy/tests/regression/test_issue4001-4500.py
Normal file
469
spacy/tests/regression/test_issue4001-4500.py
Normal file
|
@ -0,0 +1,469 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
|
||||||
|
from spacy.pipeline.defaults import default_ner
|
||||||
|
from spacy.matcher import PhraseMatcher, Matcher
|
||||||
|
from spacy.tokens import Doc, Span, DocBin
|
||||||
|
from spacy.gold import Example, Corpus
|
||||||
|
from spacy.gold.converters import json2docs
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.util import minibatch, ensure_path, load_model
|
||||||
|
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
from spacy.lang.el import Greek
|
||||||
|
from spacy.language import Language
|
||||||
|
import spacy
|
||||||
|
from thinc.api import compounding
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4002(en_vocab):
|
||||||
|
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
||||||
|
"""
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
|
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||||
|
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||||
|
matcher.add("TEST", [pattern1])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
||||||
|
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
|
pattern2 = Doc(en_vocab, words=["1", "2"])
|
||||||
|
pattern2[0].norm_ = "c"
|
||||||
|
pattern2[1].norm_ = "d"
|
||||||
|
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
||||||
|
matcher.add("TEST", [pattern2])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4030():
|
||||||
|
""" Test whether textcat works fine with empty doc """
|
||||||
|
unique_classes = ["offensive", "inoffensive"]
|
||||||
|
x_train = [
|
||||||
|
"This is an offensive text",
|
||||||
|
"This is the second offensive text",
|
||||||
|
"inoff",
|
||||||
|
]
|
||||||
|
y_train = ["offensive", "offensive", "inoffensive"]
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
# preparing the data
|
||||||
|
train_data = []
|
||||||
|
for text, train_instance in zip(x_train, y_train):
|
||||||
|
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||||
|
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||||
|
# add a text categorizer component
|
||||||
|
textcat = nlp.create_pipe(
|
||||||
|
"textcat",
|
||||||
|
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||||
|
)
|
||||||
|
for label in unique_classes:
|
||||||
|
textcat.add_label(label)
|
||||||
|
nlp.add_pipe(textcat, last=True)
|
||||||
|
# training the network
|
||||||
|
with nlp.select_pipes(enable="textcat"):
|
||||||
|
optimizer = nlp.begin_training()
|
||||||
|
for i in range(3):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(
|
||||||
|
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||||
|
)
|
||||||
|
# processing of an empty doc should result in 0.0 for all categories
|
||||||
|
doc = nlp("")
|
||||||
|
assert doc.cats["offensive"] == 0.0
|
||||||
|
assert doc.cats["inoffensive"] == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4042():
|
||||||
|
"""Test that serialization of an EntityRuler before NER works fine."""
|
||||||
|
nlp = English()
|
||||||
|
|
||||||
|
# add ner pipe
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
ner.add_label("SOME_LABEL")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
# Add entity ruler
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [
|
||||||
|
{"label": "MY_ORG", "pattern": "Apple"},
|
||||||
|
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
|
||||||
|
]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler, before="ner") # works fine with "after"
|
||||||
|
doc1 = nlp("What do you think about Apple ?")
|
||||||
|
assert doc1.ents[0].label_ == "MY_ORG"
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
output_dir = ensure_path(d)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
nlp.to_disk(output_dir)
|
||||||
|
|
||||||
|
nlp2 = load_model(output_dir)
|
||||||
|
doc2 = nlp2("What do you think about Apple ?")
|
||||||
|
assert doc2.ents[0].label_ == "MY_ORG"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4042_bug2():
|
||||||
|
"""
|
||||||
|
Test that serialization of an NER works fine when new labels were added.
|
||||||
|
This is the second bug of two bugs underlying the issue 4042.
|
||||||
|
"""
|
||||||
|
nlp1 = English()
|
||||||
|
vocab = nlp1.vocab
|
||||||
|
|
||||||
|
# add ner pipe
|
||||||
|
ner1 = nlp1.create_pipe("ner")
|
||||||
|
ner1.add_label("SOME_LABEL")
|
||||||
|
nlp1.add_pipe(ner1)
|
||||||
|
nlp1.begin_training()
|
||||||
|
|
||||||
|
# add a new label to the doc
|
||||||
|
doc1 = nlp1("What do you think about Apple ?")
|
||||||
|
assert len(ner1.labels) == 1
|
||||||
|
assert "SOME_LABEL" in ner1.labels
|
||||||
|
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
|
||||||
|
doc1.ents = list(doc1.ents) + [apple_ent]
|
||||||
|
|
||||||
|
# reapply the NER - at this point it should resize itself
|
||||||
|
ner1(doc1)
|
||||||
|
assert len(ner1.labels) == 2
|
||||||
|
assert "SOME_LABEL" in ner1.labels
|
||||||
|
assert "MY_ORG" in ner1.labels
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
# assert IO goes fine
|
||||||
|
output_dir = ensure_path(d)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
ner1.to_disk(output_dir)
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
||||||
|
ner2.from_disk(output_dir)
|
||||||
|
assert len(ner2.labels) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4054(en_vocab):
|
||||||
|
"""Test that a new blank model can be made with a vocab from file,
|
||||||
|
and that serialization does not drop the language at any point."""
|
||||||
|
nlp1 = English()
|
||||||
|
vocab1 = nlp1.vocab
|
||||||
|
with make_tempdir() as d:
|
||||||
|
vocab_dir = ensure_path(d / "vocab")
|
||||||
|
if not vocab_dir.exists():
|
||||||
|
vocab_dir.mkdir()
|
||||||
|
vocab1.to_disk(vocab_dir)
|
||||||
|
vocab2 = Vocab().from_disk(vocab_dir)
|
||||||
|
print("lang", vocab2.lang)
|
||||||
|
nlp2 = spacy.blank("en", vocab=vocab2)
|
||||||
|
nlp_dir = ensure_path(d / "nlp")
|
||||||
|
if not nlp_dir.exists():
|
||||||
|
nlp_dir.mkdir()
|
||||||
|
nlp2.to_disk(nlp_dir)
|
||||||
|
nlp3 = load_model(nlp_dir)
|
||||||
|
assert nlp3.lang == "en"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4120(en_vocab):
|
||||||
|
"""Test that matches without a final {OP: ?} token are returned."""
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
|
||||||
|
doc1 = Doc(en_vocab, words=["a"])
|
||||||
|
assert len(matcher(doc1)) == 1 # works
|
||||||
|
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
assert len(matcher(doc2)) == 2 # fixed
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
|
||||||
|
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||||
|
assert len(matcher(doc3)) == 2 # works
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
|
||||||
|
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||||
|
assert len(matcher(doc4)) == 3 # fixed
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4133(en_vocab):
|
||||||
|
nlp = English()
|
||||||
|
vocab_bytes = nlp.vocab.to_bytes()
|
||||||
|
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
|
||||||
|
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
for i, token in enumerate(doc):
|
||||||
|
token.pos_ = pos[i]
|
||||||
|
# usually this is already True when starting from proper models instead of blank English
|
||||||
|
doc.is_tagged = True
|
||||||
|
doc_bytes = doc.to_bytes()
|
||||||
|
vocab = Vocab()
|
||||||
|
vocab = vocab.from_bytes(vocab_bytes)
|
||||||
|
doc = Doc(vocab).from_bytes(doc_bytes)
|
||||||
|
actual = []
|
||||||
|
for token in doc:
|
||||||
|
actual.append(token.pos_)
|
||||||
|
assert actual == pos
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4190():
|
||||||
|
def customize_tokenizer(nlp):
|
||||||
|
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
|
||||||
|
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
|
||||||
|
infix_re = compile_infix_regex(nlp.Defaults.infixes)
|
||||||
|
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
||||||
|
exceptions = {
|
||||||
|
k: v
|
||||||
|
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
||||||
|
if not (len(k) == 2 and k[1] == ".")
|
||||||
|
}
|
||||||
|
new_tokenizer = Tokenizer(
|
||||||
|
nlp.vocab,
|
||||||
|
exceptions,
|
||||||
|
prefix_search=prefix_re.search,
|
||||||
|
suffix_search=suffix_re.search,
|
||||||
|
infix_finditer=infix_re.finditer,
|
||||||
|
token_match=nlp.tokenizer.token_match,
|
||||||
|
)
|
||||||
|
nlp.tokenizer = new_tokenizer
|
||||||
|
|
||||||
|
test_string = "Test c."
|
||||||
|
# Load default language
|
||||||
|
nlp_1 = English()
|
||||||
|
doc_1a = nlp_1(test_string)
|
||||||
|
result_1a = [token.text for token in doc_1a] # noqa: F841
|
||||||
|
# Modify tokenizer
|
||||||
|
customize_tokenizer(nlp_1)
|
||||||
|
doc_1b = nlp_1(test_string)
|
||||||
|
result_1b = [token.text for token in doc_1b]
|
||||||
|
# Save and Reload
|
||||||
|
with make_tempdir() as model_dir:
|
||||||
|
nlp_1.to_disk(model_dir)
|
||||||
|
nlp_2 = load_model(model_dir)
|
||||||
|
# This should be the modified tokenizer
|
||||||
|
doc_2 = nlp_2(test_string)
|
||||||
|
result_2 = [token.text for token in doc_2]
|
||||||
|
assert result_1b == result_2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4267():
|
||||||
|
""" Test that running an entity_ruler after ner gives consistent results"""
|
||||||
|
nlp = English()
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
ner.add_label("PEOPLE")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
assert "ner" in nlp.pipe_names
|
||||||
|
# assert that we have correct IOB annotations
|
||||||
|
doc1 = nlp("hi")
|
||||||
|
assert doc1.is_nered
|
||||||
|
for token in doc1:
|
||||||
|
assert token.ent_iob == 2
|
||||||
|
# add entity ruler and run again
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
assert "entity_ruler" in nlp.pipe_names
|
||||||
|
assert "ner" in nlp.pipe_names
|
||||||
|
# assert that we still have correct IOB annotations
|
||||||
|
doc2 = nlp("hi")
|
||||||
|
assert doc2.is_nered
|
||||||
|
for token in doc2:
|
||||||
|
assert token.ent_iob == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4272():
|
||||||
|
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
||||||
|
are available."""
|
||||||
|
nlp = Greek()
|
||||||
|
doc = nlp("Χθες")
|
||||||
|
assert doc[0].lemma_
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_predictions():
|
||||||
|
class DummyPipe(Pipe):
|
||||||
|
def __init__(self):
|
||||||
|
self.model = "dummy_model"
|
||||||
|
|
||||||
|
def predict(self, docs):
|
||||||
|
return ([1, 2, 3], [4, 5, 6])
|
||||||
|
|
||||||
|
def set_annotations(self, docs, scores, tensors=None):
|
||||||
|
return docs
|
||||||
|
|
||||||
|
nlp = Language()
|
||||||
|
doc = nlp.make_doc("foo")
|
||||||
|
dummy_pipe = DummyPipe()
|
||||||
|
dummy_pipe(doc)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
|
||||||
|
def test_issue4313():
|
||||||
|
""" This should not crash or exit with some strange error code """
|
||||||
|
beam_width = 16
|
||||||
|
beam_density = 0.0001
|
||||||
|
nlp = English()
|
||||||
|
config = {
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
|
||||||
|
ner.add_label("SOME_LABEL")
|
||||||
|
ner.begin_training([])
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
|
||||||
|
# add a new label to the doc
|
||||||
|
doc = nlp("What do you think about Apple ?")
|
||||||
|
assert len(ner.labels) == 1
|
||||||
|
assert "SOME_LABEL" in ner.labels
|
||||||
|
apple_ent = Span(doc, 5, 6, label="MY_ORG")
|
||||||
|
doc.ents = list(doc.ents) + [apple_ent]
|
||||||
|
|
||||||
|
# ensure the beam_parse still works with the new label
|
||||||
|
docs = [doc]
|
||||||
|
beams = nlp.entity.beam_parse(
|
||||||
|
docs, beam_width=beam_width, beam_density=beam_density
|
||||||
|
)
|
||||||
|
|
||||||
|
for doc, beam in zip(docs, beams):
|
||||||
|
entity_scores = defaultdict(float)
|
||||||
|
for score, ents in nlp.entity.moves.get_beam_parses(beam):
|
||||||
|
for start, end, label in ents:
|
||||||
|
entity_scores[(start, end, label)] += score
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4348():
|
||||||
|
"""Test that training the tagger with empty data, doesn't throw errors"""
|
||||||
|
nlp = English()
|
||||||
|
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
|
||||||
|
TRAIN_DATA = [example, example]
|
||||||
|
tagger = nlp.create_pipe("tagger")
|
||||||
|
nlp.add_pipe(tagger)
|
||||||
|
optimizer = nlp.begin_training()
|
||||||
|
for i in range(5):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4367():
|
||||||
|
"""Test that docbin init goes well"""
|
||||||
|
DocBin()
|
||||||
|
DocBin(attrs=["LEMMA"])
|
||||||
|
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4373():
|
||||||
|
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
||||||
|
matcher = Matcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
matcher = PhraseMatcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4402():
|
||||||
|
json_data = {
|
||||||
|
"id": 0,
|
||||||
|
"paragraphs": [
|
||||||
|
{
|
||||||
|
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
||||||
|
"sentences": [
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 0, "orth": "How", "ner": "O"},
|
||||||
|
{"id": 1, "orth": "should", "ner": "O"},
|
||||||
|
{"id": 2, "orth": "I", "ner": "O"},
|
||||||
|
{"id": 3, "orth": "cook", "ner": "O"},
|
||||||
|
{"id": 4, "orth": "bacon", "ner": "O"},
|
||||||
|
{"id": 5, "orth": "in", "ner": "O"},
|
||||||
|
{"id": 6, "orth": "an", "ner": "O"},
|
||||||
|
{"id": 7, "orth": "oven", "ner": "O"},
|
||||||
|
{"id": 8, "orth": "?", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 9, "orth": "\n", "ner": "O"},
|
||||||
|
{"id": 10, "orth": "I", "ner": "O"},
|
||||||
|
{"id": 11, "orth": "'ve", "ner": "O"},
|
||||||
|
{"id": 12, "orth": "heard", "ner": "O"},
|
||||||
|
{"id": 13, "orth": "of", "ner": "O"},
|
||||||
|
{"id": 14, "orth": "people", "ner": "O"},
|
||||||
|
{"id": 15, "orth": "cooking", "ner": "O"},
|
||||||
|
{"id": 16, "orth": "bacon", "ner": "O"},
|
||||||
|
{"id": 17, "orth": "in", "ner": "O"},
|
||||||
|
{"id": 18, "orth": "an", "ner": "O"},
|
||||||
|
{"id": 19, "orth": "oven", "ner": "O"},
|
||||||
|
{"id": 20, "orth": ".", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"cats": [
|
||||||
|
{"label": "baking", "value": 1.0},
|
||||||
|
{"label": "not_baking", "value": 0.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"raw": "What is the difference between white and brown eggs?\n",
|
||||||
|
"sentences": [
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 0, "orth": "What", "ner": "O"},
|
||||||
|
{"id": 1, "orth": "is", "ner": "O"},
|
||||||
|
{"id": 2, "orth": "the", "ner": "O"},
|
||||||
|
{"id": 3, "orth": "difference", "ner": "O"},
|
||||||
|
{"id": 4, "orth": "between", "ner": "O"},
|
||||||
|
{"id": 5, "orth": "white", "ner": "O"},
|
||||||
|
{"id": 6, "orth": "and", "ner": "O"},
|
||||||
|
{"id": 7, "orth": "brown", "ner": "O"},
|
||||||
|
{"id": 8, "orth": "eggs", "ner": "O"},
|
||||||
|
{"id": 9, "orth": "?", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
||||||
|
],
|
||||||
|
"cats": [
|
||||||
|
{"label": "baking", "value": 0.0},
|
||||||
|
{"label": "not_baking", "value": 1.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
nlp = English()
|
||||||
|
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
output_file = tmpdir / "test4402.spacy"
|
||||||
|
docs = json2docs([json_data])
|
||||||
|
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
||||||
|
with output_file.open("wb") as file_:
|
||||||
|
file_.write(data)
|
||||||
|
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||||
|
|
||||||
|
train_data = list(corpus.train_dataset(nlp))
|
||||||
|
assert len(train_data) == 2
|
||||||
|
|
||||||
|
split_train_data = []
|
||||||
|
for eg in train_data:
|
||||||
|
split_train_data.extend(eg.split_sents())
|
||||||
|
assert len(split_train_data) == 4
|
|
@ -1,23 +0,0 @@
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4002(en_vocab):
|
|
||||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
|
||||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
|
||||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
|
||||||
matcher.add("TEST", [pattern1])
|
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
|
||||||
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert len(matches) == 1
|
|
||||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
|
||||||
pattern2 = Doc(en_vocab, words=["1", "2"])
|
|
||||||
pattern2[0].norm_ = "c"
|
|
||||||
pattern2[1].norm_ = "d"
|
|
||||||
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
|
||||||
matcher.add("TEST", [pattern2])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert len(matches) == 1
|
|
|
@ -1,50 +0,0 @@
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch
|
|
||||||
from thinc.api import compounding
|
|
||||||
from spacy.gold import Example
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4030():
|
|
||||||
""" Test whether textcat works fine with empty doc """
|
|
||||||
unique_classes = ["offensive", "inoffensive"]
|
|
||||||
x_train = [
|
|
||||||
"This is an offensive text",
|
|
||||||
"This is the second offensive text",
|
|
||||||
"inoff",
|
|
||||||
]
|
|
||||||
y_train = ["offensive", "offensive", "inoffensive"]
|
|
||||||
|
|
||||||
nlp = spacy.blank("en")
|
|
||||||
|
|
||||||
# preparing the data
|
|
||||||
train_data = []
|
|
||||||
for text, train_instance in zip(x_train, y_train):
|
|
||||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
|
||||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
|
||||||
|
|
||||||
# add a text categorizer component
|
|
||||||
textcat = nlp.create_pipe(
|
|
||||||
"textcat",
|
|
||||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
|
||||||
)
|
|
||||||
|
|
||||||
for label in unique_classes:
|
|
||||||
textcat.add_label(label)
|
|
||||||
nlp.add_pipe(textcat, last=True)
|
|
||||||
|
|
||||||
# training the network
|
|
||||||
with nlp.select_pipes(enable="textcat"):
|
|
||||||
optimizer = nlp.begin_training()
|
|
||||||
for i in range(3):
|
|
||||||
losses = {}
|
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
|
||||||
|
|
||||||
for batch in batches:
|
|
||||||
nlp.update(
|
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
||||||
|
|
||||||
# processing of an empty doc should result in 0.0 for all categories
|
|
||||||
doc = nlp("")
|
|
||||||
assert doc.cats["offensive"] == 0.0
|
|
||||||
assert doc.cats["inoffensive"] == 0.0
|
|
|
@ -1,85 +0,0 @@
|
||||||
import spacy
|
|
||||||
from spacy.pipeline import EntityRecognizer, EntityRuler
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Span
|
|
||||||
from spacy.util import ensure_path
|
|
||||||
from spacy.pipeline.defaults import default_ner
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4042():
|
|
||||||
"""Test that serialization of an EntityRuler before NER works fine."""
|
|
||||||
nlp = English()
|
|
||||||
|
|
||||||
# add ner pipe
|
|
||||||
ner = nlp.create_pipe("ner")
|
|
||||||
ner.add_label("SOME_LABEL")
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
nlp.begin_training()
|
|
||||||
|
|
||||||
# Add entity ruler
|
|
||||||
ruler = EntityRuler(nlp)
|
|
||||||
patterns = [
|
|
||||||
{"label": "MY_ORG", "pattern": "Apple"},
|
|
||||||
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
|
|
||||||
]
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
nlp.add_pipe(ruler, before="ner") # works fine with "after"
|
|
||||||
doc1 = nlp("What do you think about Apple ?")
|
|
||||||
assert doc1.ents[0].label_ == "MY_ORG"
|
|
||||||
|
|
||||||
with make_tempdir() as d:
|
|
||||||
output_dir = ensure_path(d)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
nlp.to_disk(output_dir)
|
|
||||||
|
|
||||||
nlp2 = spacy.load(output_dir)
|
|
||||||
doc2 = nlp2("What do you think about Apple ?")
|
|
||||||
assert doc2.ents[0].label_ == "MY_ORG"
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4042_bug2():
|
|
||||||
"""
|
|
||||||
Test that serialization of an NER works fine when new labels were added.
|
|
||||||
This is the second bug of two bugs underlying the issue 4042.
|
|
||||||
"""
|
|
||||||
nlp1 = English()
|
|
||||||
vocab = nlp1.vocab
|
|
||||||
|
|
||||||
# add ner pipe
|
|
||||||
ner1 = nlp1.create_pipe("ner")
|
|
||||||
ner1.add_label("SOME_LABEL")
|
|
||||||
nlp1.add_pipe(ner1)
|
|
||||||
nlp1.begin_training()
|
|
||||||
|
|
||||||
# add a new label to the doc
|
|
||||||
doc1 = nlp1("What do you think about Apple ?")
|
|
||||||
assert len(ner1.labels) == 1
|
|
||||||
assert "SOME_LABEL" in ner1.labels
|
|
||||||
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
|
|
||||||
doc1.ents = list(doc1.ents) + [apple_ent]
|
|
||||||
|
|
||||||
# reapply the NER - at this point it should resize itself
|
|
||||||
ner1(doc1)
|
|
||||||
assert len(ner1.labels) == 2
|
|
||||||
assert "SOME_LABEL" in ner1.labels
|
|
||||||
assert "MY_ORG" in ner1.labels
|
|
||||||
|
|
||||||
with make_tempdir() as d:
|
|
||||||
# assert IO goes fine
|
|
||||||
output_dir = ensure_path(d)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
ner1.to_disk(output_dir)
|
|
||||||
|
|
||||||
config = {
|
|
||||||
"learn_tokens": False,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
|
||||||
ner2.from_disk(output_dir)
|
|
||||||
assert len(ner2.labels) == 2
|
|
|
@ -1,30 +0,0 @@
|
||||||
from spacy.vocab import Vocab
|
|
||||||
import spacy
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.util import ensure_path
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4054(en_vocab):
|
|
||||||
"""Test that a new blank model can be made with a vocab from file,
|
|
||||||
and that serialization does not drop the language at any point."""
|
|
||||||
nlp1 = English()
|
|
||||||
vocab1 = nlp1.vocab
|
|
||||||
|
|
||||||
with make_tempdir() as d:
|
|
||||||
vocab_dir = ensure_path(d / "vocab")
|
|
||||||
if not vocab_dir.exists():
|
|
||||||
vocab_dir.mkdir()
|
|
||||||
vocab1.to_disk(vocab_dir)
|
|
||||||
|
|
||||||
vocab2 = Vocab().from_disk(vocab_dir)
|
|
||||||
print("lang", vocab2.lang)
|
|
||||||
nlp2 = spacy.blank("en", vocab=vocab2)
|
|
||||||
|
|
||||||
nlp_dir = ensure_path(d / "nlp")
|
|
||||||
if not nlp_dir.exists():
|
|
||||||
nlp_dir.mkdir()
|
|
||||||
nlp2.to_disk(nlp_dir)
|
|
||||||
nlp3 = spacy.load(nlp_dir)
|
|
||||||
assert nlp3.lang == "en"
|
|
|
@ -1,23 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4120(en_vocab):
|
|
||||||
"""Test that matches without a final {OP: ?} token are returned."""
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
|
|
||||||
doc1 = Doc(en_vocab, words=["a"])
|
|
||||||
assert len(matcher(doc1)) == 1 # works
|
|
||||||
|
|
||||||
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
|
||||||
assert len(matcher(doc2)) == 2 # fixed
|
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
|
|
||||||
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
|
||||||
assert len(matcher(doc3)) == 2 # works
|
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
|
|
||||||
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
|
||||||
assert len(matcher(doc4)) == 3 # fixed
|
|
|
@ -1,28 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4133(en_vocab):
|
|
||||||
nlp = English()
|
|
||||||
vocab_bytes = nlp.vocab.to_bytes()
|
|
||||||
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
|
|
||||||
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
|
|
||||||
doc = Doc(en_vocab, words=words)
|
|
||||||
for i, token in enumerate(doc):
|
|
||||||
token.pos_ = pos[i]
|
|
||||||
|
|
||||||
# usually this is already True when starting from proper models instead of blank English
|
|
||||||
doc.is_tagged = True
|
|
||||||
|
|
||||||
doc_bytes = doc.to_bytes()
|
|
||||||
|
|
||||||
vocab = Vocab()
|
|
||||||
vocab = vocab.from_bytes(vocab_bytes)
|
|
||||||
doc = Doc(vocab).from_bytes(doc_bytes)
|
|
||||||
|
|
||||||
actual = []
|
|
||||||
for token in doc:
|
|
||||||
actual.append(token.pos_)
|
|
||||||
|
|
||||||
assert actual == pos
|
|
|
@ -1,46 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokenizer import Tokenizer
|
|
||||||
from spacy import util
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4190():
|
|
||||||
test_string = "Test c."
|
|
||||||
# Load default language
|
|
||||||
nlp_1 = English()
|
|
||||||
doc_1a = nlp_1(test_string)
|
|
||||||
result_1a = [token.text for token in doc_1a] # noqa: F841
|
|
||||||
# Modify tokenizer
|
|
||||||
customize_tokenizer(nlp_1)
|
|
||||||
doc_1b = nlp_1(test_string)
|
|
||||||
result_1b = [token.text for token in doc_1b]
|
|
||||||
# Save and Reload
|
|
||||||
with make_tempdir() as model_dir:
|
|
||||||
nlp_1.to_disk(model_dir)
|
|
||||||
nlp_2 = util.load_model(model_dir)
|
|
||||||
# This should be the modified tokenizer
|
|
||||||
doc_2 = nlp_2(test_string)
|
|
||||||
result_2 = [token.text for token in doc_2]
|
|
||||||
assert result_1b == result_2
|
|
||||||
|
|
||||||
|
|
||||||
def customize_tokenizer(nlp):
|
|
||||||
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
|
|
||||||
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
|
|
||||||
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
|
|
||||||
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
|
||||||
exceptions = {
|
|
||||||
k: v
|
|
||||||
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
|
||||||
if not (len(k) == 2 and k[1] == ".")
|
|
||||||
}
|
|
||||||
new_tokenizer = Tokenizer(
|
|
||||||
nlp.vocab,
|
|
||||||
exceptions,
|
|
||||||
prefix_search=prefix_re.search,
|
|
||||||
suffix_search=suffix_re.search,
|
|
||||||
infix_finditer=infix_re.finditer,
|
|
||||||
token_match=nlp.tokenizer.token_match,
|
|
||||||
)
|
|
||||||
nlp.tokenizer = new_tokenizer
|
|
|
@ -1,34 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4267():
|
|
||||||
""" Test that running an entity_ruler after ner gives consistent results"""
|
|
||||||
nlp = English()
|
|
||||||
ner = nlp.create_pipe("ner")
|
|
||||||
ner.add_label("PEOPLE")
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
nlp.begin_training()
|
|
||||||
|
|
||||||
assert "ner" in nlp.pipe_names
|
|
||||||
|
|
||||||
# assert that we have correct IOB annotations
|
|
||||||
doc1 = nlp("hi")
|
|
||||||
assert doc1.is_nered
|
|
||||||
for token in doc1:
|
|
||||||
assert token.ent_iob == 2
|
|
||||||
|
|
||||||
# add entity ruler and run again
|
|
||||||
ruler = EntityRuler(nlp)
|
|
||||||
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
|
|
||||||
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
assert "entity_ruler" in nlp.pipe_names
|
|
||||||
assert "ner" in nlp.pipe_names
|
|
||||||
|
|
||||||
# assert that we still have correct IOB annotations
|
|
||||||
doc2 = nlp("hi")
|
|
||||||
assert doc2.is_nered
|
|
||||||
for token in doc2:
|
|
||||||
assert token.ent_iob == 2
|
|
|
@ -1,9 +0,0 @@
|
||||||
from spacy.lang.el import Greek
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4272():
|
|
||||||
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
|
||||||
are available."""
|
|
||||||
nlp = Greek()
|
|
||||||
doc = nlp("Χθες")
|
|
||||||
assert doc[0].lemma_
|
|
|
@ -1,25 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.pipeline import Pipe
|
|
||||||
|
|
||||||
|
|
||||||
class DummyPipe(Pipe):
|
|
||||||
def __init__(self):
|
|
||||||
self.model = "dummy_model"
|
|
||||||
|
|
||||||
def predict(self, docs):
|
|
||||||
return ([1, 2, 3], [4, 5, 6])
|
|
||||||
|
|
||||||
def set_annotations(self, docs, scores, tensors=None):
|
|
||||||
return docs
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def nlp():
|
|
||||||
return Language()
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_predictions(nlp):
|
|
||||||
doc = nlp.make_doc("foo")
|
|
||||||
dummy_pipe = DummyPipe()
|
|
||||||
dummy_pipe(doc)
|
|
|
@ -1,47 +0,0 @@
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from spacy.pipeline.defaults import default_ner
|
|
||||||
from spacy.pipeline import EntityRecognizer
|
|
||||||
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Span
|
|
||||||
|
|
||||||
|
|
||||||
# skipped after removing Beam stuff during the Example/GoldParse refactor
|
|
||||||
@pytest.mark.skip
|
|
||||||
def test_issue4313():
|
|
||||||
""" This should not crash or exit with some strange error code """
|
|
||||||
beam_width = 16
|
|
||||||
beam_density = 0.0001
|
|
||||||
nlp = English()
|
|
||||||
config = {
|
|
||||||
"learn_tokens": False,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
|
|
||||||
ner.add_label("SOME_LABEL")
|
|
||||||
ner.begin_training([])
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
|
|
||||||
# add a new label to the doc
|
|
||||||
doc = nlp("What do you think about Apple ?")
|
|
||||||
assert len(ner.labels) == 1
|
|
||||||
assert "SOME_LABEL" in ner.labels
|
|
||||||
apple_ent = Span(doc, 5, 6, label="MY_ORG")
|
|
||||||
doc.ents = list(doc.ents) + [apple_ent]
|
|
||||||
|
|
||||||
# ensure the beam_parse still works with the new label
|
|
||||||
docs = [doc]
|
|
||||||
beams = nlp.entity.beam_parse(
|
|
||||||
docs, beam_width=beam_width, beam_density=beam_density
|
|
||||||
)
|
|
||||||
|
|
||||||
for doc, beam in zip(docs, beams):
|
|
||||||
entity_scores = defaultdict(float)
|
|
||||||
for score, ents in nlp.entity.moves.get_beam_parses(beam):
|
|
||||||
for start, end, label in ents:
|
|
||||||
entity_scores[(start, end, label)] += score
|
|
|
@ -1,24 +0,0 @@
|
||||||
from spacy.gold import Example
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.util import minibatch
|
|
||||||
from thinc.api import compounding
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
|
||||||
def test_issue4348():
|
|
||||||
"""Test that training the tagger with empty data, doesn't throw errors"""
|
|
||||||
|
|
||||||
nlp = English()
|
|
||||||
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
|
|
||||||
TRAIN_DATA = [example, example]
|
|
||||||
|
|
||||||
tagger = nlp.create_pipe("tagger")
|
|
||||||
nlp.add_pipe(tagger)
|
|
||||||
|
|
||||||
optimizer = nlp.begin_training()
|
|
||||||
for i in range(5):
|
|
||||||
losses = {}
|
|
||||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
|
||||||
for batch in batches:
|
|
||||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
|
|
@ -1,8 +0,0 @@
|
||||||
from spacy.tokens import DocBin
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4367():
|
|
||||||
"""Test that docbin init goes well"""
|
|
||||||
DocBin()
|
|
||||||
DocBin(attrs=["LEMMA"])
|
|
||||||
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
|
|
|
@ -1,10 +0,0 @@
|
||||||
from spacy.matcher import Matcher, PhraseMatcher
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4373():
|
|
||||||
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
|
||||||
matcher = Matcher(Vocab())
|
|
||||||
assert isinstance(matcher.vocab, Vocab)
|
|
||||||
matcher = PhraseMatcher(Vocab())
|
|
||||||
assert isinstance(matcher.vocab, Vocab)
|
|
|
@ -1,98 +0,0 @@
|
||||||
from spacy.gold import Corpus
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
from ...gold.converters import json2docs
|
|
||||||
from ...tokens import DocBin
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4402():
|
|
||||||
nlp = English()
|
|
||||||
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
output_file = tmpdir / "test4402.spacy"
|
|
||||||
docs = json2docs([json_data])
|
|
||||||
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
|
||||||
with output_file.open("wb") as file_:
|
|
||||||
file_.write(data)
|
|
||||||
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
|
||||||
|
|
||||||
train_data = list(corpus.train_dataset(nlp))
|
|
||||||
assert len(train_data) == 2
|
|
||||||
|
|
||||||
split_train_data = []
|
|
||||||
for eg in train_data:
|
|
||||||
split_train_data.extend(eg.split_sents())
|
|
||||||
assert len(split_train_data) == 4
|
|
||||||
|
|
||||||
|
|
||||||
json_data = {
|
|
||||||
"id": 0,
|
|
||||||
"paragraphs": [
|
|
||||||
{
|
|
||||||
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
|
||||||
"sentences": [
|
|
||||||
{
|
|
||||||
"tokens": [
|
|
||||||
{"id": 0, "orth": "How", "ner": "O"},
|
|
||||||
{"id": 1, "orth": "should", "ner": "O"},
|
|
||||||
{"id": 2, "orth": "I", "ner": "O"},
|
|
||||||
{"id": 3, "orth": "cook", "ner": "O"},
|
|
||||||
{"id": 4, "orth": "bacon", "ner": "O"},
|
|
||||||
{"id": 5, "orth": "in", "ner": "O"},
|
|
||||||
{"id": 6, "orth": "an", "ner": "O"},
|
|
||||||
{"id": 7, "orth": "oven", "ner": "O"},
|
|
||||||
{"id": 8, "orth": "?", "ner": "O"},
|
|
||||||
],
|
|
||||||
"brackets": [],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tokens": [
|
|
||||||
{"id": 9, "orth": "\n", "ner": "O"},
|
|
||||||
{"id": 10, "orth": "I", "ner": "O"},
|
|
||||||
{"id": 11, "orth": "'ve", "ner": "O"},
|
|
||||||
{"id": 12, "orth": "heard", "ner": "O"},
|
|
||||||
{"id": 13, "orth": "of", "ner": "O"},
|
|
||||||
{"id": 14, "orth": "people", "ner": "O"},
|
|
||||||
{"id": 15, "orth": "cooking", "ner": "O"},
|
|
||||||
{"id": 16, "orth": "bacon", "ner": "O"},
|
|
||||||
{"id": 17, "orth": "in", "ner": "O"},
|
|
||||||
{"id": 18, "orth": "an", "ner": "O"},
|
|
||||||
{"id": 19, "orth": "oven", "ner": "O"},
|
|
||||||
{"id": 20, "orth": ".", "ner": "O"},
|
|
||||||
],
|
|
||||||
"brackets": [],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
"cats": [
|
|
||||||
{"label": "baking", "value": 1.0},
|
|
||||||
{"label": "not_baking", "value": 0.0},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"raw": "What is the difference between white and brown eggs?\n",
|
|
||||||
"sentences": [
|
|
||||||
{
|
|
||||||
"tokens": [
|
|
||||||
{"id": 0, "orth": "What", "ner": "O"},
|
|
||||||
{"id": 1, "orth": "is", "ner": "O"},
|
|
||||||
{"id": 2, "orth": "the", "ner": "O"},
|
|
||||||
{"id": 3, "orth": "difference", "ner": "O"},
|
|
||||||
{"id": 4, "orth": "between", "ner": "O"},
|
|
||||||
{"id": 5, "orth": "white", "ner": "O"},
|
|
||||||
{"id": 6, "orth": "and", "ner": "O"},
|
|
||||||
{"id": 7, "orth": "brown", "ner": "O"},
|
|
||||||
{"id": 8, "orth": "eggs", "ner": "O"},
|
|
||||||
{"id": 9, "orth": "?", "ner": "O"},
|
|
||||||
],
|
|
||||||
"brackets": [],
|
|
||||||
},
|
|
||||||
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
|
||||||
],
|
|
||||||
"cats": [
|
|
||||||
{"label": "baking", "value": 0.0},
|
|
||||||
{"label": "not_baking", "value": 1.0},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
}
|
|
288
spacy/tests/regression/test_issue4501-5000.py
Normal file
288
spacy/tests/regression/test_issue4501-5000.py
Normal file
|
@ -0,0 +1,288 @@
|
||||||
|
import pytest
|
||||||
|
from mock import Mock
|
||||||
|
from spacy.pipeline import EntityRuler
|
||||||
|
from spacy.matcher import DependencyMatcher
|
||||||
|
from spacy.tokens import Doc, Span, DocBin
|
||||||
|
from spacy.gold import Example
|
||||||
|
from spacy.gold.converters.conllu2docs import conllu2docs
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.kb import KnowledgeBase
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.util import ensure_path, load_model_from_path
|
||||||
|
import numpy
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
from ..util import get_doc, make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4528(en_vocab):
|
||||||
|
"""Test that user_data is correctly serialized in DocBin."""
|
||||||
|
doc = Doc(en_vocab, words=["hello", "world"])
|
||||||
|
doc.user_data["foo"] = "bar"
|
||||||
|
# This is how extension attribute values are stored in the user data
|
||||||
|
doc.user_data[("._.", "foo", None, None)] = "bar"
|
||||||
|
doc_bin = DocBin(store_user_data=True)
|
||||||
|
doc_bin.add(doc)
|
||||||
|
doc_bin_bytes = doc_bin.to_bytes()
|
||||||
|
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
|
||||||
|
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
|
||||||
|
assert new_doc.user_data["foo"] == "bar"
|
||||||
|
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
|
||||||
|
)
|
||||||
|
def test_gold_misaligned(en_tokenizer, text, words):
|
||||||
|
doc = en_tokenizer(text)
|
||||||
|
Example.from_dict(doc, {"words": words})
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4590(en_vocab):
|
||||||
|
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
||||||
|
pattern = [
|
||||||
|
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||||
|
"PATTERN": {"ORTH": "fox"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
||||||
|
"PATTERN": {"ORTH": "fox"},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
on_match = Mock()
|
||||||
|
matcher = DependencyMatcher(en_vocab)
|
||||||
|
matcher.add("pattern", on_match, pattern)
|
||||||
|
text = "The quick brown fox jumped over the lazy fox"
|
||||||
|
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||||
|
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
|
||||||
|
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
||||||
|
matches = matcher(doc)
|
||||||
|
on_match_args = on_match.call_args
|
||||||
|
assert on_match_args[0][3] == matches
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4651_with_phrase_matcher_attr():
|
||||||
|
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||||
|
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||||
|
specified.
|
||||||
|
"""
|
||||||
|
text = "Spacy is a python library for nlp"
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
|
||||||
|
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
doc = nlp(text)
|
||||||
|
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||||
|
nlp_reloaded = English()
|
||||||
|
with make_tempdir() as d:
|
||||||
|
file_path = d / "entityruler"
|
||||||
|
ruler.to_disk(file_path)
|
||||||
|
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||||
|
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||||
|
doc_reloaded = nlp_reloaded(text)
|
||||||
|
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||||
|
assert res == res_reloaded
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4651_without_phrase_matcher_attr():
|
||||||
|
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||||
|
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||||
|
not specified.
|
||||||
|
"""
|
||||||
|
text = "Spacy is a python library for nlp"
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
doc = nlp(text)
|
||||||
|
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||||
|
nlp_reloaded = English()
|
||||||
|
with make_tempdir() as d:
|
||||||
|
file_path = d / "entityruler"
|
||||||
|
ruler.to_disk(file_path)
|
||||||
|
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||||
|
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||||
|
doc_reloaded = nlp_reloaded(text)
|
||||||
|
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||||
|
assert res == res_reloaded
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4665():
|
||||||
|
"""
|
||||||
|
conllu2json should not raise an exception if the HEAD column contains an
|
||||||
|
underscore
|
||||||
|
"""
|
||||||
|
input_data = """
|
||||||
|
1 [ _ PUNCT -LRB- _ _ punct _ _
|
||||||
|
2 This _ DET DT _ _ det _ _
|
||||||
|
3 killing _ NOUN NN _ _ nsubj _ _
|
||||||
|
4 of _ ADP IN _ _ case _ _
|
||||||
|
5 a _ DET DT _ _ det _ _
|
||||||
|
6 respected _ ADJ JJ _ _ amod _ _
|
||||||
|
7 cleric _ NOUN NN _ _ nmod _ _
|
||||||
|
8 will _ AUX MD _ _ aux _ _
|
||||||
|
9 be _ AUX VB _ _ aux _ _
|
||||||
|
10 causing _ VERB VBG _ _ root _ _
|
||||||
|
11 us _ PRON PRP _ _ iobj _ _
|
||||||
|
12 trouble _ NOUN NN _ _ dobj _ _
|
||||||
|
13 for _ ADP IN _ _ case _ _
|
||||||
|
14 years _ NOUN NNS _ _ nmod _ _
|
||||||
|
15 to _ PART TO _ _ mark _ _
|
||||||
|
16 come _ VERB VB _ _ acl _ _
|
||||||
|
17 . _ PUNCT . _ _ punct _ _
|
||||||
|
18 ] _ PUNCT -RRB- _ _ punct _ _
|
||||||
|
"""
|
||||||
|
conllu2docs(input_data)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4674():
|
||||||
|
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
||||||
|
nlp = English()
|
||||||
|
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||||
|
vector1 = [0.9, 1.1, 1.01]
|
||||||
|
vector2 = [1.8, 2.25, 2.01]
|
||||||
|
with pytest.warns(UserWarning):
|
||||||
|
kb.set_entities(
|
||||||
|
entity_list=["Q1", "Q1"],
|
||||||
|
freq_list=[32, 111],
|
||||||
|
vector_list=[vector1, vector2],
|
||||||
|
)
|
||||||
|
assert kb.get_size_entities() == 1
|
||||||
|
# dumping to file & loading back in
|
||||||
|
with make_tempdir() as d:
|
||||||
|
dir_path = ensure_path(d)
|
||||||
|
if not dir_path.exists():
|
||||||
|
dir_path.mkdir()
|
||||||
|
file_path = dir_path / "kb"
|
||||||
|
kb.dump(str(file_path))
|
||||||
|
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
|
||||||
|
kb2.load_bulk(str(file_path))
|
||||||
|
assert kb2.get_size_entities() == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4707():
|
||||||
|
"""Tests that disabled component names are also excluded from nlp.from_disk
|
||||||
|
by default when loading a model.
|
||||||
|
"""
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||||
|
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
|
||||||
|
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
|
||||||
|
exclude = ["tokenizer", "sentencizer"]
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
nlp.to_disk(tmpdir, exclude=exclude)
|
||||||
|
new_nlp = load_model_from_path(tmpdir, disable=exclude)
|
||||||
|
assert "sentencizer" not in new_nlp.pipe_names
|
||||||
|
assert "entity_ruler" in new_nlp.pipe_names
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4725_1():
|
||||||
|
""" Ensure the pickling of the NER goes well"""
|
||||||
|
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||||
|
nlp = English(vocab=vocab)
|
||||||
|
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
|
||||||
|
with make_tempdir() as tmp_path:
|
||||||
|
with (tmp_path / "ner.pkl").open("wb") as file_:
|
||||||
|
pickle.dump(ner, file_)
|
||||||
|
assert ner.cfg["min_action_freq"] == 342
|
||||||
|
|
||||||
|
with (tmp_path / "ner.pkl").open("rb") as file_:
|
||||||
|
ner2 = pickle.load(file_)
|
||||||
|
assert ner2.cfg["min_action_freq"] == 342
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4725_2():
|
||||||
|
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
||||||
|
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
|
||||||
|
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||||
|
data = numpy.ndarray((5, 3), dtype="f")
|
||||||
|
data[0] = 1.0
|
||||||
|
data[1] = 2.0
|
||||||
|
vocab.set_vector("cat", data[0])
|
||||||
|
vocab.set_vector("dog", data[1])
|
||||||
|
nlp = English(vocab=vocab)
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
docs = ["Kurt is in London."] * 10
|
||||||
|
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4849():
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(
|
||||||
|
nlp,
|
||||||
|
patterns=[
|
||||||
|
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
|
||||||
|
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
|
||||||
|
],
|
||||||
|
phrase_matcher_attr="LOWER",
|
||||||
|
)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
text = """
|
||||||
|
The left is starting to take aim at Democratic front-runner Joe Biden.
|
||||||
|
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
|
||||||
|
"""
|
||||||
|
# USING 1 PROCESS
|
||||||
|
count_ents = 0
|
||||||
|
for doc in nlp.pipe([text], n_process=1):
|
||||||
|
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||||
|
assert count_ents == 2
|
||||||
|
# USING 2 PROCESSES
|
||||||
|
count_ents = 0
|
||||||
|
for doc in nlp.pipe([text], n_process=2):
|
||||||
|
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||||
|
assert count_ents == 2
|
||||||
|
|
||||||
|
|
||||||
|
class CustomPipe:
|
||||||
|
name = "my_pipe"
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
Span.set_extension("my_ext", getter=self._get_my_ext)
|
||||||
|
Doc.set_extension("my_ext", default=None)
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
gathered_ext = []
|
||||||
|
for sent in doc.sents:
|
||||||
|
sent_ext = self._get_my_ext(sent)
|
||||||
|
sent._.set("my_ext", sent_ext)
|
||||||
|
gathered_ext.append(sent_ext)
|
||||||
|
|
||||||
|
doc._.set("my_ext", "\n".join(gathered_ext))
|
||||||
|
|
||||||
|
return doc
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_my_ext(span):
|
||||||
|
return str(span.end)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4903():
|
||||||
|
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
|
||||||
|
macOS."""
|
||||||
|
nlp = English()
|
||||||
|
custom_component = CustomPipe()
|
||||||
|
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||||
|
nlp.add_pipe(custom_component, after="sentencizer")
|
||||||
|
|
||||||
|
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
|
||||||
|
docs = list(nlp.pipe(text, n_process=2))
|
||||||
|
assert docs[0].text == "I like bananas."
|
||||||
|
assert docs[1].text == "Do you like them?"
|
||||||
|
assert docs[2].text == "No, I prefer wasabi."
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4924():
|
||||||
|
nlp = Language()
|
||||||
|
example = Example.from_dict(nlp.make_doc(""), {})
|
||||||
|
nlp.evaluate([example])
|
|
@ -1,16 +0,0 @@
|
||||||
from spacy.tokens import Doc, DocBin
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4528(en_vocab):
|
|
||||||
"""Test that user_data is correctly serialized in DocBin."""
|
|
||||||
doc = Doc(en_vocab, words=["hello", "world"])
|
|
||||||
doc.user_data["foo"] = "bar"
|
|
||||||
# This is how extension attribute values are stored in the user data
|
|
||||||
doc.user_data[("._.", "foo", None, None)] = "bar"
|
|
||||||
doc_bin = DocBin(store_user_data=True)
|
|
||||||
doc_bin.add(doc)
|
|
||||||
doc_bin_bytes = doc_bin.to_bytes()
|
|
||||||
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
|
|
||||||
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
|
|
||||||
assert new_doc.user_data["foo"] == "bar"
|
|
||||||
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
|
|
|
@ -1,11 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from spacy.gold import Example
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
|
|
||||||
)
|
|
||||||
def test_gold_misaligned(en_tokenizer, text, words):
|
|
||||||
doc = en_tokenizer(text)
|
|
||||||
Example.from_dict(doc, {"words": words})
|
|
|
@ -1,35 +0,0 @@
|
||||||
from mock import Mock
|
|
||||||
from spacy.matcher import DependencyMatcher
|
|
||||||
from ..util import get_doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4590(en_vocab):
|
|
||||||
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
|
||||||
pattern = [
|
|
||||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
|
||||||
{
|
|
||||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
|
||||||
"PATTERN": {"ORTH": "fox"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
|
||||||
"PATTERN": {"ORTH": "fox"},
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
on_match = Mock()
|
|
||||||
|
|
||||||
matcher = DependencyMatcher(en_vocab)
|
|
||||||
matcher.add("pattern", on_match, pattern)
|
|
||||||
|
|
||||||
text = "The quick brown fox jumped over the lazy fox"
|
|
||||||
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
|
||||||
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
|
|
||||||
|
|
||||||
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
|
||||||
|
|
||||||
matches = matcher(doc)
|
|
||||||
|
|
||||||
on_match_args = on_match.call_args
|
|
||||||
|
|
||||||
assert on_match_args[0][3] == matches
|
|
|
@ -1,62 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4651_with_phrase_matcher_attr():
|
|
||||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
|
||||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
|
||||||
specified.
|
|
||||||
"""
|
|
||||||
text = "Spacy is a python library for nlp"
|
|
||||||
|
|
||||||
nlp = English()
|
|
||||||
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
|
|
||||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
doc = nlp(text)
|
|
||||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
|
||||||
|
|
||||||
nlp_reloaded = English()
|
|
||||||
with make_tempdir() as d:
|
|
||||||
file_path = d / "entityruler"
|
|
||||||
ruler.to_disk(file_path)
|
|
||||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
|
||||||
|
|
||||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
|
||||||
doc_reloaded = nlp_reloaded(text)
|
|
||||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
|
||||||
|
|
||||||
assert res == res_reloaded
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4651_without_phrase_matcher_attr():
|
|
||||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
|
||||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
|
||||||
not specified.
|
|
||||||
"""
|
|
||||||
text = "Spacy is a python library for nlp"
|
|
||||||
|
|
||||||
nlp = English()
|
|
||||||
ruler = EntityRuler(nlp)
|
|
||||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
doc = nlp(text)
|
|
||||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
|
||||||
|
|
||||||
nlp_reloaded = English()
|
|
||||||
with make_tempdir() as d:
|
|
||||||
file_path = d / "entityruler"
|
|
||||||
ruler.to_disk(file_path)
|
|
||||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
|
||||||
|
|
||||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
|
||||||
doc_reloaded = nlp_reloaded(text)
|
|
||||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
|
||||||
|
|
||||||
assert res == res_reloaded
|
|
|
@ -1,35 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
# TODO
|
|
||||||
# from spacy.gold.converters.conllu2docs import conllu2docs
|
|
||||||
|
|
||||||
input_data = """
|
|
||||||
1 [ _ PUNCT -LRB- _ _ punct _ _
|
|
||||||
2 This _ DET DT _ _ det _ _
|
|
||||||
3 killing _ NOUN NN _ _ nsubj _ _
|
|
||||||
4 of _ ADP IN _ _ case _ _
|
|
||||||
5 a _ DET DT _ _ det _ _
|
|
||||||
6 respected _ ADJ JJ _ _ amod _ _
|
|
||||||
7 cleric _ NOUN NN _ _ nmod _ _
|
|
||||||
8 will _ AUX MD _ _ aux _ _
|
|
||||||
9 be _ AUX VB _ _ aux _ _
|
|
||||||
10 causing _ VERB VBG _ _ root _ _
|
|
||||||
11 us _ PRON PRP _ _ iobj _ _
|
|
||||||
12 trouble _ NOUN NN _ _ dobj _ _
|
|
||||||
13 for _ ADP IN _ _ case _ _
|
|
||||||
14 years _ NOUN NNS _ _ nmod _ _
|
|
||||||
15 to _ PART TO _ _ mark _ _
|
|
||||||
16 come _ VERB VB _ _ acl _ _
|
|
||||||
17 . _ PUNCT . _ _ punct _ _
|
|
||||||
18 ] _ PUNCT -RRB- _ _ punct _ _
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_issue4665():
|
|
||||||
"""
|
|
||||||
conllu2json should not raise an exception if the HEAD column contains an
|
|
||||||
underscore
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
# conllu2json(input_data)
|
|
|
@ -1,36 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.kb import KnowledgeBase
|
|
||||||
from spacy.util import ensure_path
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4674():
|
|
||||||
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
|
||||||
nlp = English()
|
|
||||||
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
|
||||||
|
|
||||||
vector1 = [0.9, 1.1, 1.01]
|
|
||||||
vector2 = [1.8, 2.25, 2.01]
|
|
||||||
with pytest.warns(UserWarning):
|
|
||||||
kb.set_entities(
|
|
||||||
entity_list=["Q1", "Q1"],
|
|
||||||
freq_list=[32, 111],
|
|
||||||
vector_list=[vector1, vector2],
|
|
||||||
)
|
|
||||||
|
|
||||||
assert kb.get_size_entities() == 1
|
|
||||||
|
|
||||||
# dumping to file & loading back in
|
|
||||||
with make_tempdir() as d:
|
|
||||||
dir_path = ensure_path(d)
|
|
||||||
if not dir_path.exists():
|
|
||||||
dir_path.mkdir()
|
|
||||||
file_path = dir_path / "kb"
|
|
||||||
kb.dump(str(file_path))
|
|
||||||
|
|
||||||
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
|
|
||||||
kb2.load_bulk(str(file_path))
|
|
||||||
|
|
||||||
assert kb2.get_size_entities() == 1
|
|
|
@ -1,20 +0,0 @@
|
||||||
from spacy.util import load_model_from_path
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4707():
|
|
||||||
"""Tests that disabled component names are also excluded from nlp.from_disk
|
|
||||||
by default when loading a model.
|
|
||||||
"""
|
|
||||||
nlp = English()
|
|
||||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
|
||||||
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
|
|
||||||
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
|
|
||||||
exclude = ["tokenizer", "sentencizer"]
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
nlp.to_disk(tmpdir, exclude=exclude)
|
|
||||||
new_nlp = load_model_from_path(tmpdir, disable=exclude)
|
|
||||||
assert "sentencizer" not in new_nlp.pipe_names
|
|
||||||
assert "entity_ruler" in new_nlp.pipe_names
|
|
|
@ -1,41 +0,0 @@
|
||||||
import pickle
|
|
||||||
import numpy
|
|
||||||
|
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
from spacy.tests.util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_pickle_ner():
|
|
||||||
""" Ensure the pickling of the NER goes well"""
|
|
||||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
|
||||||
nlp = English(vocab=vocab)
|
|
||||||
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
|
|
||||||
with make_tempdir() as tmp_path:
|
|
||||||
with (tmp_path / "ner.pkl").open("wb") as file_:
|
|
||||||
pickle.dump(ner, file_)
|
|
||||||
assert ner.cfg["min_action_freq"] == 342
|
|
||||||
|
|
||||||
with (tmp_path / "ner.pkl").open("rb") as file_:
|
|
||||||
ner2 = pickle.load(file_)
|
|
||||||
assert ner2.cfg["min_action_freq"] == 342
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4725():
|
|
||||||
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
|
||||||
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
|
|
||||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
|
||||||
data = numpy.ndarray((5, 3), dtype="f")
|
|
||||||
data[0] = 1.0
|
|
||||||
data[1] = 2.0
|
|
||||||
vocab.set_vector("cat", data[0])
|
|
||||||
vocab.set_vector("dog", data[1])
|
|
||||||
|
|
||||||
nlp = English(vocab=vocab)
|
|
||||||
ner = nlp.create_pipe("ner")
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
nlp.begin_training()
|
|
||||||
docs = ["Kurt is in London."] * 10
|
|
||||||
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
|
||||||
pass
|
|
|
@ -1,34 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4849():
|
|
||||||
nlp = English()
|
|
||||||
|
|
||||||
ruler = EntityRuler(
|
|
||||||
nlp,
|
|
||||||
patterns=[
|
|
||||||
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
|
|
||||||
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
|
|
||||||
],
|
|
||||||
phrase_matcher_attr="LOWER",
|
|
||||||
)
|
|
||||||
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
text = """
|
|
||||||
The left is starting to take aim at Democratic front-runner Joe Biden.
|
|
||||||
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
|
|
||||||
"""
|
|
||||||
|
|
||||||
# USING 1 PROCESS
|
|
||||||
count_ents = 0
|
|
||||||
for doc in nlp.pipe([text], n_process=1):
|
|
||||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
|
||||||
assert count_ents == 2
|
|
||||||
|
|
||||||
# USING 2 PROCESSES
|
|
||||||
count_ents = 0
|
|
||||||
for doc in nlp.pipe([text], n_process=2):
|
|
||||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
|
||||||
assert count_ents == 2
|
|
|
@ -1,40 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.tokens import Span, Doc
|
|
||||||
|
|
||||||
|
|
||||||
class CustomPipe:
|
|
||||||
name = "my_pipe"
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
Span.set_extension("my_ext", getter=self._get_my_ext)
|
|
||||||
Doc.set_extension("my_ext", default=None)
|
|
||||||
|
|
||||||
def __call__(self, doc):
|
|
||||||
gathered_ext = []
|
|
||||||
for sent in doc.sents:
|
|
||||||
sent_ext = self._get_my_ext(sent)
|
|
||||||
sent._.set("my_ext", sent_ext)
|
|
||||||
gathered_ext.append(sent_ext)
|
|
||||||
|
|
||||||
doc._.set("my_ext", "\n".join(gathered_ext))
|
|
||||||
|
|
||||||
return doc
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _get_my_ext(span):
|
|
||||||
return str(span.end)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4903():
|
|
||||||
# ensures that this runs correctly and doesn't hang or crash on Windows / macOS
|
|
||||||
|
|
||||||
nlp = English()
|
|
||||||
custom_component = CustomPipe()
|
|
||||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
|
||||||
nlp.add_pipe(custom_component, after="sentencizer")
|
|
||||||
|
|
||||||
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
|
|
||||||
docs = list(nlp.pipe(text, n_process=2))
|
|
||||||
assert docs[0].text == "I like bananas."
|
|
||||||
assert docs[1].text == "Do you like them?"
|
|
||||||
assert docs[2].text == "No, I prefer wasabi."
|
|
|
@ -1,8 +0,0 @@
|
||||||
from spacy.gold import Example
|
|
||||||
from spacy.language import Language
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4924():
|
|
||||||
nlp = Language()
|
|
||||||
example = Example.from_dict(nlp.make_doc(""), {})
|
|
||||||
nlp.evaluate([example])
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
import pytest
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue5152():
|
def test_issue5152():
|
||||||
# Test that the comparison between a Span and a Token, goes well
|
# Test that the comparison between a Span and a Token, goes well
|
||||||
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
|
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
|
||||||
|
@ -8,7 +10,6 @@ def test_issue5152():
|
||||||
text = nlp("Talk about being boring!")
|
text = nlp("Talk about being boring!")
|
||||||
text_var = nlp("Talk of being boring!")
|
text_var = nlp("Talk of being boring!")
|
||||||
y = nlp("Let")
|
y = nlp("Let")
|
||||||
|
|
||||||
span = text[0:3] # Talk about being
|
span = text[0:3] # Talk about being
|
||||||
span_2 = text[0:3] # Talk about being
|
span_2 = text[0:3] # Talk about being
|
||||||
span_3 = text_var[0:3] # Talk of being
|
span_3 = text_var[0:3] # Talk of being
|
||||||
|
|
|
@ -63,7 +63,8 @@ def tagger():
|
||||||
# need to add model for two reasons:
|
# need to add model for two reasons:
|
||||||
# 1. no model leads to error in serialization,
|
# 1. no model leads to error in serialization,
|
||||||
# 2. the affected line is the one for model serialization
|
# 2. the affected line is the one for model serialization
|
||||||
tagger.begin_training(pipeline=nlp.pipeline)
|
with pytest.warns(UserWarning):
|
||||||
|
tagger.begin_training(pipeline=nlp.pipeline)
|
||||||
return tagger
|
return tagger
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
from spacy.errors import AlignmentError
|
from spacy.errors import AlignmentError
|
||||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||||
from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
|
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
|
||||||
from spacy.gold import Corpus, docs_to_json
|
from spacy.gold import Corpus, docs_to_json
|
||||||
from spacy.gold.example import Example
|
from spacy.gold.example import Example
|
||||||
from spacy.gold.converters import json2docs
|
from spacy.gold.converters import json2docs
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
from spacy.pipeline import EntityRuler
|
||||||
from spacy.tokens import Doc, DocBin
|
from spacy.tokens import Doc, DocBin
|
||||||
from spacy.util import get_words_and_spaces, minibatch
|
from spacy.util import get_words_and_spaces, minibatch
|
||||||
from thinc.api import compounding
|
from thinc.api import compounding
|
||||||
|
@ -271,75 +272,76 @@ def test_split_sentences(en_vocab):
|
||||||
assert split_examples[1].text == "had loads of fun "
|
assert split_examples[1].text == "had loads of fun "
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
|
|
||||||
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
|
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
|
||||||
words = ["I", "flew to", "San Francisco Valley", "."]
|
words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
spaces = [True, True, False, False]
|
spaces = [True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
|
gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "U-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("I "), len("I flew to"), "ORG"),
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "U-ORG", "U-LOC", "O"]
|
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("I "), len("I flew"), "ORG"),
|
(len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person
|
||||||
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", None, "U-LOC", "O"]
|
assert ner_tags == ["O", None, "O", "U-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
|
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
|
||||||
words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
|
spaces = [True, True, True, True, True, True, True, False, False]
|
||||||
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
|
gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
ner_tags = example.get_aligned_ner()
|
||||||
|
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
entities = [
|
||||||
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
ner_tags = example.get_aligned_ner()
|
||||||
|
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
|
||||||
|
words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
|
||||||
spaces = [True, True, True, True, True, False, False]
|
spaces = [True, True, True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
gold_words = ["I", "flew to", "San Francisco Valley", "."]
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
|
gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
entities = [
|
entities = [
|
||||||
(len("I "), len("I flew to"), "ORG"),
|
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
|
||||||
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
]
|
]
|
||||||
gold_words = ["I", "flew to", "San Francisco Valley", "."]
|
gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"]
|
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
|
|
||||||
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
|
|
||||||
words = ["I flew", "to", "San Francisco", "Valley", "."]
|
|
||||||
spaces = [True, True, True, False, False]
|
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
|
||||||
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
|
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
|
||||||
ner_tags = example.get_aligned_ner()
|
|
||||||
assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
|
|
||||||
|
|
||||||
entities = [
|
|
||||||
(len("I "), len("I flew to"), "ORG"),
|
|
||||||
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
|
||||||
]
|
|
||||||
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
|
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
|
||||||
ner_tags = example.get_aligned_ner()
|
|
||||||
assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
|
def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
|
||||||
|
@ -349,7 +351,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
|
||||||
"I flew to San Francisco Valley.",
|
"I flew to San Francisco Valley.",
|
||||||
)
|
)
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
prefix = "I flew to "
|
||||||
|
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
|
||||||
gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
|
gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
|
||||||
gold_spaces = [True, True, False, True, False, False]
|
gold_spaces = [True, True, False, True, False, False]
|
||||||
example = Example.from_dict(
|
example = Example.from_dict(
|
||||||
|
@ -405,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
|
||||||
assert spans[1].label_ == "GPE"
|
assert spans[1].label_ == "GPE"
|
||||||
|
|
||||||
|
|
||||||
|
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
|
||||||
|
words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
|
||||||
|
spaces = [True, True, True, False, False]
|
||||||
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
|
entities = [
|
||||||
|
(0, len("Mr and Mrs Smith"), "PERSON"),
|
||||||
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
|
||||||
|
ents_ref = example.reference.ents
|
||||||
|
assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
|
||||||
|
ents_y2x = example.get_aligned_spans_y2x(ents_ref)
|
||||||
|
assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
|
||||||
|
text = "Mr and Mrs Smith flew to San Francisco Valley"
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
|
||||||
|
{"label": "LOC", "pattern": "San Francisco Valley"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
doc = nlp(text)
|
||||||
|
assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
|
||||||
|
prefix = "Mr and Mrs Smith flew to "
|
||||||
|
entities = [
|
||||||
|
(0, len("Mr and Mrs Smith"), "PERSON"),
|
||||||
|
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
|
||||||
|
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
|
||||||
|
assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
|
||||||
|
|
||||||
|
# Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
|
||||||
|
ents_pred = example.predicted.ents
|
||||||
|
assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
|
||||||
|
ents_x2y = example.get_aligned_spans_x2y(ents_pred)
|
||||||
|
assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
|
||||||
|
|
||||||
|
|
||||||
def test_gold_ner_missing_tags(en_tokenizer):
|
def test_gold_ner_missing_tags(en_tokenizer):
|
||||||
doc = en_tokenizer("I flew to Silicon Valley via London.")
|
doc = en_tokenizer("I flew to Silicon Valley via London.")
|
||||||
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||||
|
@ -412,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
|
||||||
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
|
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_projectivize(en_tokenizer):
|
||||||
|
doc = en_tokenizer("He pretty quickly walks away")
|
||||||
|
heads = [3, 2, 3, 0, 2]
|
||||||
|
example = Example.from_dict(doc, {"heads": heads})
|
||||||
|
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
|
||||||
|
nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
|
||||||
|
assert proj_heads == [3, 2, 3, 0, 3]
|
||||||
|
assert nonproj_heads == [3, 2, 3, 0, 2]
|
||||||
|
|
||||||
|
|
||||||
def test_iob_to_biluo():
|
def test_iob_to_biluo():
|
||||||
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
||||||
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
|
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
|
||||||
|
@ -514,6 +570,7 @@ def test_make_orth_variants(doc):
|
||||||
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip("Outdated")
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"tokens_a,tokens_b,expected",
|
"tokens_a,tokens_b,expected",
|
||||||
[
|
[
|
||||||
|
@ -537,12 +594,12 @@ def test_make_orth_variants(doc):
|
||||||
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
|
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_align(tokens_a, tokens_b, expected):
|
def test_align(tokens_a, tokens_b, expected): # noqa
|
||||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
|
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa
|
||||||
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected
|
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa
|
||||||
# check symmetry
|
# check symmetry
|
||||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
|
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa
|
||||||
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
|
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa
|
||||||
|
|
||||||
|
|
||||||
def test_goldparse_startswith_space(en_tokenizer):
|
def test_goldparse_startswith_space(en_tokenizer):
|
||||||
|
@ -556,7 +613,7 @@ def test_goldparse_startswith_space(en_tokenizer):
|
||||||
doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
|
doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
|
||||||
)
|
)
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == [None, "U-DATE"]
|
assert ner_tags == ["O", "U-DATE"]
|
||||||
assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]
|
assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,7 @@ def test_aligned_tags():
|
||||||
predicted = Doc(vocab, words=pred_words)
|
predicted = Doc(vocab, words=pred_words)
|
||||||
example = Example.from_dict(predicted, annots)
|
example = Example.from_dict(predicted, annots)
|
||||||
aligned_tags = example.get_aligned("tag", as_string=True)
|
aligned_tags = example.get_aligned("tag", as_string=True)
|
||||||
assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"]
|
assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
|
||||||
|
|
||||||
|
|
||||||
def test_aligned_tags_multi():
|
def test_aligned_tags_multi():
|
||||||
|
|
31
spacy/tests/test_projects.py
Normal file
31
spacy/tests/test_projects.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.cli.project.util import validate_project_commands
|
||||||
|
from spacy.schemas import ProjectConfigSchema, validate
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"config",
|
||||||
|
[
|
||||||
|
{"commands": [{"name": "a"}, {"name": "a"}]},
|
||||||
|
{"commands": [{"name": "a"}], "workflows": {"a": []}},
|
||||||
|
{"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_project_config_validation1(config):
|
||||||
|
with pytest.raises(SystemExit):
|
||||||
|
validate_project_commands(config)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"config,n_errors",
|
||||||
|
[
|
||||||
|
({"commands": {"a": []}}, 1),
|
||||||
|
({"commands": [{"help": "..."}]}, 1),
|
||||||
|
({"commands": [{"name": "a", "extra": "b"}]}, 1),
|
||||||
|
({"commands": [{"extra": "b"}]}, 2),
|
||||||
|
({"commands": [{"name": "a", "deps": [123]}]}, 1),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_project_config_validation2(config, n_errors):
|
||||||
|
errors = validate(ProjectConfigSchema, config)
|
||||||
|
assert len(errors) == n_errors
|
|
@ -449,6 +449,16 @@ def split_command(command: str) -> List[str]:
|
||||||
return shlex.split(command, posix=not is_windows)
|
return shlex.split(command, posix=not is_windows)
|
||||||
|
|
||||||
|
|
||||||
|
def join_command(command: List[str]) -> str:
|
||||||
|
"""Join a command using shlex. shlex.join is only available for Python 3.8+,
|
||||||
|
so we're using a workaround here.
|
||||||
|
|
||||||
|
command (List[str]): The command to join.
|
||||||
|
RETURNS (str): The joined command
|
||||||
|
"""
|
||||||
|
return " ".join(shlex.quote(cmd) for cmd in command)
|
||||||
|
|
||||||
|
|
||||||
def run_command(command: Union[str, List[str]]) -> None:
|
def run_command(command: Union[str, List[str]]) -> None:
|
||||||
"""Run a command on the command line as a subprocess. If the subprocess
|
"""Run a command on the command line as a subprocess. If the subprocess
|
||||||
returns a non-zero exit code, a system exit is performed.
|
returns a non-zero exit code, a system exit is performed.
|
||||||
|
@ -520,6 +530,15 @@ def get_checksum(path: Union[Path, str]) -> str:
|
||||||
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def is_cwd(path: Union[Path, str]) -> bool:
|
||||||
|
"""Check whether a path is the current working directory.
|
||||||
|
|
||||||
|
path (Union[Path, str]): The directory path.
|
||||||
|
RETURNS (bool): Whether the path is the current working directory.
|
||||||
|
"""
|
||||||
|
return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
|
||||||
|
|
||||||
|
|
||||||
def is_in_jupyter():
|
def is_in_jupyter():
|
||||||
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
||||||
IPython kernel. Mainly used for the displaCy visualizer.
|
IPython kernel. Mainly used for the displaCy visualizer.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user