Merge remote-tracking branch 'origin/develop' into rliaw-develop

commit 3bccf8b954
@@ -5,16 +5,16 @@
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length or number of examples.
-max_length = 5000
+max_length = 3000
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-dropout = 0.2
+dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
-patience = 1600
+patience = 100000
 max_epochs = 0
-max_steps = 20000
-eval_frequency = 500
+max_steps = 0
+eval_frequency = 1000
 # Other settings
 seed = 0
 accumulate_gradient = 1
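To make the early-stopping interplay concrete, here is a minimal sketch of the stopping rule these settings drive; the function name and bookkeeping are illustrative, not spaCy's internals:

def should_stop(step: int, best_step: int, patience: int, max_steps: int) -> bool:
    # Stop once max_steps is reached (0 means no limit) ...
    if max_steps and step >= max_steps:
        return True
    # ... or once `patience` steps pass without a new best evaluation score.
    if patience and step - best_step >= patience:
        return True
    return False

# With patience = 100000 and max_steps = 0, training effectively runs until
# interrupted; the old values (1600 / 20000) stopped much earlier.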
@ -26,6 +26,7 @@ score_weights = {"ents_f": 1.0}
|
|||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
omit_extra_lookups = false
|
||||
batch_by = "words"
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
|
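The [training.batch_size] block registers a compounding schedule. A minimal sketch of what a compounding schedule computes (the start/stop sizes here are assumed; only compound = 1.001 is visible in this hunk):

def compounding(start: float, stop: float, compound: float):
    # Yield start, start * compound, start * compound ** 2, ... capped at stop.
    value = start
    while True:
        yield min(value, stop)
        value *= compound

batch_sizes = compounding(100.0, 1000.0, 1.001)  # assumed start/stop values
print([round(next(batch_sizes), 2) for _ in range(3)])
# [100.0, 100.1, 100.2]: the batch size grows by about 0.1% per step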
@@ -37,19 +38,13 @@ compound = 1.001
 @optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
-L2_is_weight_decay = false
-L2 = 1e-6
+L2_is_weight_decay = true
+L2 = 0.01
 grad_clip = 1.0
 use_averages = true
 eps = 1e-8
 learn_rate = 0.001
-
-#[optimizer.learn_rate]
-#@schedules = "warmup_linear.v1"
-#warmup_steps = 250
-#total_steps = 20000
-#initial_rate = 0.001
 
 [nlp]
 lang = "en"
 vectors = null
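Switching L2_is_weight_decay to true with L2 = 0.01 moves from classic L2 regularization to decoupled weight decay in the AdamW style. A simplified NumPy sketch of the difference, omitting Adam's bias correction:

import numpy as np

def adam_step(param, grad, m, v, lr=0.001, beta1=0.9, beta2=0.999,
              eps=1e-8, L2=0.01, L2_is_weight_decay=True):
    if not L2_is_weight_decay:
        grad = grad + L2 * param              # classic L2: penalty enters the gradient
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad ** 2
    param = param - lr * m / (np.sqrt(v) + eps)
    if L2_is_weight_decay:
        param = param - lr * L2 * param       # decoupled decay, applied after the step
    return param, m, v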
@@ -58,8 +53,6 @@ vectors = null
 factory = "ner"
 learn_tokens = false
 min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0
 
 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
@@ -1,8 +1,7 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a2"
+__version__ = "3.0.0a4"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
 __projects__ = "https://github.com/explosion/spacy-boilerplates"
@@ -11,12 +11,15 @@ from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .validate import validate  # noqa: F401
-from .project import project_clone, project_assets, project_run  # noqa: F401
-from .project import project_run_all  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface
|
|||
|
||||
DOCS: https://spacy.io/api/cli
|
||||
"""
|
||||
PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
|
||||
project templates. You'd typically start by cloning a project template to a local
|
||||
directory and fetching its assets like datasets etc. See the project's
|
||||
project.yml for the available commands.
|
||||
"""
|
||||
|
||||
|
||||
app = typer.Typer(name=NAME, help=HELP)
|
||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||
app.add_typer(project_cli)
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
# keep the names short, but not needed at the moment.
|
||||
|
|
|
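The add_typer pattern above is how the "spacy project ..." subcommands get mounted on the main CLI. A self-contained sketch of the same composition (names here are illustrative):

import typer

app = typer.Typer(name="spacy", help="Top-level CLI")
project_cli = typer.Typer(name="project", help="Project subcommands", no_args_is_help=True)
app.add_typer(project_cli)

@project_cli.command("clone")
def clone(name: str):
    # Invoked as: spacy project clone NAME
    typer.echo(f"Cloning {name}")

if __name__ == "__main__":
    app()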
@@ -120,8 +120,12 @@ def convert(
         no_print=silent,
         ner_map=ner_map,
     )
+    if file_type == "json":
+        data = [docs_to_json(docs)]
+    else:
+        data = DocBin(docs=docs, store_user_data=True).to_bytes()
     if output_dir == "-":
-        _print_docs_to_stdout(docs, file_type)
+        _print_docs_to_stdout(data, file_type)
     else:
         if input_loc != input_path:
             subpath = input_loc.relative_to(input_path)
@@ -129,24 +133,23 @@ def convert(
         else:
             output_file = Path(output_dir) / input_loc.parts[-1]
         output_file = output_file.with_suffix(f".{file_type}")
-        _write_docs_to_file(docs, output_file, file_type)
+        _write_docs_to_file(data, output_file, file_type)
         msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
 
 
-def _print_docs_to_stdout(docs, output_type):
+def _print_docs_to_stdout(data, output_type):
     if output_type == "json":
-        srsly.write_json("-", [docs_to_json(docs)])
+        srsly.write_json("-", data)
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
+        sys.stdout.buffer.write(data)
 
 
-def _write_docs_to_file(docs, output_file, output_type):
+def _write_docs_to_file(data, output_file, output_type):
     if not output_file.parent.exists():
         output_file.parent.mkdir(parents=True)
     if output_type == "json":
-        srsly.write_json(output_file, [docs_to_json(docs)])
+        srsly.write_json(output_file, data)
     else:
-        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
 
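The refactor above serializes once into `data` and reuses it for both stdout and file output. A small round-trip sketch of the binary path (the output filename is made up):

from spacy.lang.en import English
from spacy.tokens import DocBin

nlp = English()
docs = list(nlp.pipe(["One document.", "Another document."]))

data = DocBin(docs=docs, store_user_data=True).to_bytes()
with open("corpus.spacy", "wb") as f:  # hypothetical output path
    f.write(data)

# The bytes restore the same docs on the way back in:
restored = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
assert [d.text for d in restored] == [d.text for d in docs]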
new file: spacy/cli/debug_model.py (168 lines)
@@ -0,0 +1,168 @@
from typing import List
from pathlib import Path
from wasabi import msg

from ._app import app, Arg, Opt
from .. import util
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from ..lang.en import English


@app.command("debug-model")
def debug_model_cli(
    # fmt: off
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    layers: str = Opt("", "--layers", "-l", help="Comma-separated indices of the layers to print"),
    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
    attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
    P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
    seed: int = Opt(None, "--seed", "-s", help="Fix random seed"),
    # fmt: on
):
    """
    Analyze a Thinc ML model - internal structure and activations during training
    """
    print_settings = {
        "dimensions": dimensions,
        "parameters": parameters,
        "gradients": gradients,
        "attributes": attributes,
        "layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
        "print_before_training": P0,
        "print_after_init": P1,
        "print_after_training": P2,
        "print_prediction": P3,
    }

    if seed is not None:
        msg.info(f"Fixing random seed: {seed}")
        fix_random_seed(seed)
    if use_gpu >= 0:
        msg.info(f"Using GPU: {use_gpu}")
        require_gpu(use_gpu)
    else:
        msg.info(f"Using CPU")

    debug_model(
        config_path,
        print_settings=print_settings,
    )


def debug_model(
    config_path: Path,
    *,
    print_settings=None
):
    if print_settings is None:
        print_settings = {}

    model = util.load_config(config_path, create_objects=True)["model"]

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.info(f"Before training:")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
    if print_settings.get("print_after_init"):
        msg.info(f"After initialization:")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
    for e in range(3):
        Y, get_dX = model.begin_update(_get_docs())
        dY = get_gradient(model, Y)
        _ = get_dX(dY)
        model.finish_update(optimizer)
    if print_settings.get("print_after_training"):
        msg.info(f"After training:")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict(_get_docs())
    if print_settings.get("print_prediction"):
        msg.info(f"Prediction:", str(prediction))


def get_gradient(model, Y):
    goldY = _get_output(model.ops.xp)
    return Y - goldY


def _sentences():
    return [
        "Apple is looking at buying U.K. startup for $1 billion",
        "Autonomous cars shift insurance liability toward manufacturers",
        "San Francisco considers banning sidewalk delivery robots",
        "London is a big city in the United Kingdom.",
    ]


def _get_docs():
    nlp = English()
    return list(nlp.pipe(_sentences()))


def _get_output(xp):
    return xp.asarray([xp.asarray([i+10, i+20, i+30], dtype="float32") for i, _ in enumerate(_get_docs())])


def _print_model(model, print_settings):
    layers = print_settings.get("layers", "")
    parameters = print_settings.get("parameters", False)
    dimensions = print_settings.get("dimensions", False)
    gradients = print_settings.get("gradients", False)
    attributes = print_settings.get("attributes", False)

    for i, node in enumerate(model.walk()):
        if not layers or i in layers:
            msg.info(f"Layer {i}: model ID {node.id}: '{node.name}'")

            if dimensions:
                for name in node.dim_names:
                    if node.has_dim(name):
                        msg.info(f" - dim {name}: {node.get_dim(name)}")
                    else:
                        msg.info(f" - dim {name}: {node.has_dim(name)}")

            if parameters:
                for name in node.param_names:
                    if node.has_param(name):
                        print_value = _print_matrix(node.get_param(name))
                        msg.info(f" - param {name}: {print_value}")
                    else:
                        msg.info(f" - param {name}: {node.has_param(name)}")
            if gradients:
                for name in node.param_names:
                    if node.has_grad(name):
                        print_value = _print_matrix(node.get_grad(name))
                        msg.info(f" - grad {name}: {print_value}")
                    else:
                        msg.info(f" - grad {name}: {node.has_grad(name)}")
            if attributes:
                attrs = node.attrs
                for name, value in attrs.items():
                    msg.info(f" - attr {name}: {value}")


def _print_matrix(value):
    if value is None or isinstance(value, bool):
        return value
    result = str(value.shape) + " - sample: "
    sample_matrix = value
    for d in range(value.ndim-1):
        sample_matrix = sample_matrix[0]
    sample_matrix = sample_matrix[0:5]
    result = result + str(sample_matrix)
    return result
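For reference, the dummy gradient in get_gradient() corresponds to a squared-error objective: for L = 0.5 * ||Y - gold||^2, dL/dY = Y - gold. A quick numeric check:

import numpy as np

Y = np.asarray([11.0, 21.0, 31.0], dtype="float32")      # model output
gold = np.asarray([10.0, 20.0, 30.0], dtype="float32")   # first row of _get_output()
dY = Y - gold                                            # what get_gradient() returns
assert np.allclose(dY, [1.0, 1.0, 1.0])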
@@ -1,4 +1,4 @@
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence
 import requests
 import sys
 from wasabi import msg
@@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
 from .. import about
 from ..util import is_package, get_base_version, run_command
 
+# These are the old shortcuts we previously supported in spacy download. As of
+# v3, shortcuts are deprecated so we're not expecting to add anything to this
+# list. It only exists to show users warnings.
+OLD_SHORTCUTS = {
+    "en": "en_core_web_sm",
+    "de": "de_core_news_sm",
+    "es": "es_core_news_sm",
+    "pt": "pt_core_news_sm",
+    "fr": "fr_core_news_sm",
+    "it": "it_core_news_sm",
+    "nl": "nl_core_news_sm",
+    "el": "el_core_news_sm",
+    "nb": "nb_core_news_sm",
+    "lt": "lt_core_news_sm",
+    "xx": "xx_ent_wiki_sm",
+}
+
 
 @app.command(
     "download",
@@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
         version = components[-1]
         download_model(dl_tpl.format(m=model_name, v=version), pip_args)
     else:
-        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
-        model_name = shortcuts.get(model, model)
+        model_name = model
+        if model in OLD_SHORTCUTS:
+            msg.warn(
+                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
+                f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
+            )
+            model_name = OLD_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
         download_model(dl_tpl.format(m=model_name, v=version), pip_args)
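The deprecation path above boils down to a dictionary lookup plus a warning. A standalone sketch of the same resolution logic:

OLD_SHORTCUTS = {"en": "en_core_web_sm", "de": "de_core_news_sm"}  # excerpt

def resolve_model_name(model: str) -> str:
    if model in OLD_SHORTCUTS:
        print(f"Warning: shortcut '{model}' is deprecated, "
              f"use '{OLD_SHORTCUTS[model]}' instead")
        return OLD_SHORTCUTS[model]
    return model

assert resolve_model_name("en") == "en_core_web_sm"
assert resolve_model_name("en_core_web_md") == "en_core_web_md"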
@@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
     )
 
 
-def get_json(url: str, desc: str) -> Union[dict, list]:
-    r = requests.get(url)
+def get_compatibility() -> dict:
+    version = get_base_version(about.__version__)
+    r = requests.get(about.__compatibility__)
     if r.status_code != 200:
         msg.fail(
             f"Server error ({r.status_code})",
-            f"Couldn't fetch {desc}. Please find a model for your spaCy "
+            f"Couldn't fetch compatibility table. Please find a model for your spaCy "
             f"installation (v{about.__version__}), and download it manually. "
             f"For more details, see the documentation: "
             f"https://spacy.io/usage/models",
             exits=1,
         )
-    return r.json()
-
-
-def get_compatibility() -> dict:
-    version = get_base_version(about.__version__)
-    comp_table = get_json(about.__compatibility__, "compatibility table")
+    comp_table = r.json()
     comp = comp_table["spacy"]
     if version not in comp:
         msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
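get_compatibility() now fetches and parses the table inline instead of going through the generic get_json() helper. The table is keyed by spaCy version, then model name; an illustrative shape with invented values:

comp_table = {"spacy": {"3.0.0a4": {"en_core_web_sm": ["3.0.0a0"]}}}

version = "3.0.0a4"                      # get_base_version(about.__version__)
comp = comp_table["spacy"]
if version not in comp:
    raise SystemExit(f"No compatible models found for v{version} of spaCy")
model_versions = comp[version]["en_core_web_sm"]
print(model_versions[0])  # newest compatible release listed first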
@@ -1,708 +0,0 @@
from typing import List, Dict, Any, Optional, Sequence
import typer
import srsly
from pathlib import Path
from wasabi import msg
import subprocess
import os
import re
import shutil
import sys
import requests
import tqdm

from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about
from ..schemas import ProjectConfigSchema, validate
from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum, split_command


CONFIG_FILE = "project.yml"
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
DIRS = [
    "assets",
    "metas",
    "configs",
    "packages",
    "metrics",
    "scripts",
    "notebooks",
    "training",
    "corpus",
]
CACHES = [
    Path.home() / ".torch",
    Path.home() / ".caches" / "torch",
    os.environ.get("TORCH_HOME"),
    Path.home() / ".keras",
]
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
# it directly and edit the project.yml instead and re-run the project."""
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
templates. You'd typically start by cloning a project template to a local
directory and fetching its assets like datasets etc. See the project's
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
Version Control) to manage input and output files and to ensure steps are only
re-run if their inputs change.
"""

project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)


@project_cli.callback(invoke_without_command=True)
def callback(ctx: typer.Context):
    """This runs before every project command and ensures DVC is installed."""
    ensure_dvc()


################
# CLI COMMANDS #
################


@project_cli.command("clone")
def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to fetch"),
    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
    no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
    (including using a private repo). Setting the --git flag will also
    initialize the project directory as a Git repo. If the project is intended
    to be a Git repo, it should be initialized with Git first, before
    initializing DVC (Data Version Control). This allows DVC to integrate with
    Git.
    """
    if dest == Path.cwd():
        dest = dest / name
    project_clone(name, dest, repo=repo, git=git, no_init=no_init)


@project_cli.command("init")
def project_init_cli(
    # fmt: off
    path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
    force: bool = Opt(False, "--force", "-F", help="Force initialization"),
    # fmt: on
):
    """Initialize a project directory with DVC and optionally Git. This should
    typically be taken care of automatically when you run the "project clone"
    command, but you can also run it separately. If the project is intended to
    be a Git repo, it should be initialized with Git first, before initializing
    DVC. This allows DVC to integrate with Git.
    """
    project_init(path, git=git, force=force, silent=True)


@project_cli.command("assets")
def project_assets_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Use DVC (Data Version Control) to fetch project assets. Assets are
    defined in the "assets" section of the project config. If possible, DVC
    will try to track the files so you can pull changes from upstream. It will
    also try and store the checksum so the assets are versioned. If the file
    can't be tracked or checked, it will be downloaded without DVC. If a checksum
    is provided in the project config, the file is only downloaded if no local
    file with the same checksum exists.
    """
    project_assets(project_dir)


@project_cli.command(
    "run-all",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_all_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run all commands defined in the project. This command will use DVC and
    the defined outputs and dependencies in the project config to determine
    which steps need to be re-run and where to start. This means you're only
    re-generating data if the inputs have changed.

    This command calls into "dvc repro" and all additional arguments are passed
    to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
    """
    if show_help:
        print_run_help(project_dir)
    else:
        project_run_all(project_dir, *ctx.args)


@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,
    subcommand: str = Arg(None, help="Name of command defined in project config"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named script defined in the project config. If the command is
    part of the default pipeline defined in the "run" section, DVC is used to
    determine whether the step should re-run if its inputs have changed, or
    whether everything is up to date. If the script is not part of the default
    pipeline, it will be called separately without DVC.

    If DVC is used, the command calls into "dvc repro" and all additional
    arguments are passed to the "dvc repro" command:
    https://dvc.org/doc/command-reference/repro
    """
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        project_run(project_dir, subcommand, *ctx.args)


@project_cli.command("exec", hidden=True)
def project_exec_cli(
    # fmt: off
    subcommand: str = Arg(..., help="Name of command defined in project config"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Execute a command defined in the project config. This CLI command is
    only called internally in auto-generated DVC pipelines, as a shortcut for
    multi-step commands in the project config. You typically shouldn't have to
    call it yourself. To run a command, call "run" or "run-all".
    """
    project_exec(project_dir, subcommand)


@project_cli.command("update-dvc")
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Update the auto-generated DVC config file. Uses the steps defined in the
    "run" section of the project config. This typically happens automatically
    when running a command, but can also be triggered manually if needed.
    """
    config = load_project_config(project_dir)
    updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
    if updated:
        msg.good(f"Updated DVC config from {CONFIG_FILE}")
    else:
        msg.info(f"No changes found in {CONFIG_FILE}, no update needed")


app.add_typer(project_cli, name="project")


#################
# CLI FUNCTIONS #
#################
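These commands all consume the project.yml loaded by load_project_config() below. An illustrative config, written as the Python dict the YAML parses into (keys taken from this module, values invented):

project_config = {
    "variables": {"lang": "en"},
    "assets": [
        {"url": "https://example.com/data.json", "dest": "assets/data.json",
         "checksum": "abc123"},
    ],
    "commands": [
        {"name": "preprocess", "script": ["python scripts/preprocess.py"],
         "deps": ["assets/data.json"], "outputs": ["corpus/train.json"]},
        {"name": "train", "script": ["python scripts/train.py {lang}"]},
    ],
    "run": ["preprocess", "train"],  # the DVC-tracked default pipeline
}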
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    git: bool = False,
    no_init: bool = False,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    git (bool): Initialize project as Git repo. Should be set to True if project
        is intended as a repo, since it will allow DVC to integrate with Git.
    no_init (bool): Don't initialize DVC and Git automatically. If True, the
        "init" command or "git init" and "dvc init" need to be run manually.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
        try:
            run_command(cmd)
        except SystemExit:
            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
            msg.fail(err)
        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
            f.write(name)
        try:
            run_command(["git", "-C", str(tmp_dir), "fetch"])
            run_command(["git", "-C", str(tmp_dir), "checkout"])
        except SystemExit:
            err = f"Could not clone '{name}' in the repo '{repo}'."
            msg.fail(err)
        shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
    msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
    for sub_dir in DIRS:
        dir_path = project_dir / sub_dir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    if not no_init:
        project_init(project_dir, git=git, force=True, silent=True)
    msg.good(f"Your project is now ready!", dest)
    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")


def project_init(
    project_dir: Path,
    *,
    git: bool = False,
    force: bool = False,
    silent: bool = False,
    analytics: bool = False,
):
    """Initialize a project as a DVC and (optionally) as a Git repo.

    project_dir (Path): Path to project directory.
    git (bool): Also call "git init" to initialize directory as a Git repo.
    silent (bool): Don't print any output (via DVC).
    analytics (bool): Opt-in to DVC analytics (defaults to False).
    """
    with working_dir(project_dir) as cwd:
        if git:
            run_command(["git", "init"])
        init_cmd = ["dvc", "init"]
        if silent:
            init_cmd.append("--quiet")
        if not git:
            init_cmd.append("--no-scm")
        if force:
            init_cmd.append("--force")
        run_command(init_cmd)
        # We don't want to have analytics on by default – our users should
        # opt-in explicitly. If they want it, they can always enable it.
        if not analytics:
            run_command(["dvc", "config", "core.analytics", "false"])
        # Remove unused and confusing plot templates from .dvc directory
        # TODO: maybe we shouldn't do this, but it's otherwise super confusing
        # once you commit your changes via Git and it creates a bunch of files
        # that have no purpose
        plots_dir = cwd / DVC_DIR / "plots"
        if plots_dir.exists():
            shutil.rmtree(str(plots_dir))
        config = load_project_config(cwd)
        setup_check_dvc(cwd, config)


def project_assets(project_dir: Path) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    setup_check_dvc(project_path, config)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    variables = config.get("variables", {})
    fetched_assets = []
    for asset in assets:
        url = asset["url"].format(**variables)
        dest = asset["dest"].format(**variables)
        fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
        if fetched_path:
            fetched_assets.append(str(fetched_path))
    if fetched_assets:
        with working_dir(project_path):
            run_command(["dvc", "add", *fetched_assets, "--external"])


def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
    """Fetch an asset from a given URL or path. Will try to import the file
    using DVC's import-url if possible (fully tracked and versioned) and falls
    back to get-url (versioned) and a non-DVC download if necessary. If a
    checksum is provided and a local file exists, it's only re-downloaded if the
    checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
        the asset failed.
    """
    url = convert_asset_url(url)
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        # TODO: add support for caches (dvc import-url with local path)
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return dest_path
    with working_dir(project_path):
        try:
            # If these fail, we don't want to output an error or info message.
            # Try with tracking the source first, then just downloading with
            # DVC, then a regular non-DVC download.
            try:
                dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
            except subprocess.CalledProcessError:
                dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
        except subprocess.CalledProcessError:
            try:
                download_file(url, dest_path)
            except requests.exceptions.HTTPError as e:
                msg.fail(f"Download failed: {dest}", e)
                return None
    if checksum and checksum != get_checksum(dest_path):
        msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
    msg.good(f"Fetched asset {dest}")
    return dest_path
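fetch_asset() tries three strategies in order. A condensed sketch of that fallback chain (helper names match this module; error handling trimmed):

import subprocess
from pathlib import Path

def fetch_with_fallbacks(url: str, dest: Path) -> None:
    try:
        try:
            # 1) Fully tracked and versioned by DVC
            subprocess.check_output(["dvc", "import-url", url, str(dest)],
                                    stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError:
            # 2) Downloaded via DVC but not tracked
            subprocess.check_output(["dvc", "get-url", url, str(dest)],
                                    stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        # 3) Plain HTTP download as the last resort
        download_file(url, dest)  # defined at the end of this module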
def project_run_all(project_dir: Path, *dvc_args) -> None:
    """Run all commands defined in the project using DVC.

    project_dir (Path): Path to project directory.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    dvc_cmd = ["dvc", "repro", *dvc_args]
    with working_dir(project_dir):
        run_command(dvc_cmd)


def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project config.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    if subcommand:
        validate_subcommand(commands.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
        help_text = commands[subcommand].get("help")
        if help_text:
            msg.text(f"\n{help_text}\n")
    else:
        print(f"\nAvailable commands in {CONFIG_FILE}")
        print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        msg.text("Run all commands defined in the 'run' block of the project config:")
        print(f"{COMMAND} project run-all {project_dir}")


def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
    """Run a named script defined in the project config. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    validate_subcommand(commands.keys(), subcommand)
    if subcommand in config.get("run", []):
        # This is one of the pipeline commands tracked in DVC
        dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
        with working_dir(project_dir):
            run_command(dvc_cmd)
    else:
        cmd = commands[subcommand]
        # Deps in non-DVC commands aren't tracked, but if they're defined,
        # make sure they exist before running the command
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                msg.fail(err, exits=1)
        with working_dir(project_dir):
            run_commands(cmd["script"], variables)


def project_exec(project_dir: Path, subcommand: str):
    """Execute a command defined in the project config.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    with working_dir(project_dir):
        run_commands(commands[subcommand]["script"], variables)


###########
# HELPERS #
###########


def load_project_config(path: Path) -> Dict[str, Any]:
    """Load the project config file from a directory and validate it.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project config.
    """
    config_path = path / CONFIG_FILE
    if not config_path.exists():
        msg.fail("Can't find project config", config_path, exits=1)
    invalid_err = f"Invalid project config in {CONFIG_FILE}"
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err, "\n".join(errors), exits=1)
    return config
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project config, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    commands = []
    # We only want to include commands that are part of the main list of "run"
    # commands in project.yml and should be run in sequence
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    for name in config.get("run", []):
        validate_subcommand(config_commands.keys(), name)
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "exec", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
        dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
        if verbose:
            dvc_cmd.append("--verbose")
        if silent:
            dvc_cmd.append("--quiet")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        commands.append(" ".join(full_cmd))
    with working_dir(path):
        run_commands(commands, variables, silent=True)
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
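The hash header written into dvc.yaml is what makes the regeneration cheap to skip. An illustrative stand-in for get_hash() and the header check (the real helper lives in spacy.util and may hash differently):

import hashlib
import json

def config_hash(config: dict) -> str:
    # Stable digest of the project config, written as the first line of dvc.yaml
    data = json.dumps(config, sort_keys=True).encode("utf8")
    return hashlib.md5(data).hexdigest()

cfg = {"commands": [{"name": "train", "script": ["python train.py"]}]}
header = f"# {config_hash(cfg)}"
# On the next run, update_dvc_config() re-reads this first line and skips
# regeneration when the hash is unchanged (unless force=True is passed).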
def ensure_dvc() -> None:
    """Ensure that the "dvc" command is available and show an error if not."""
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "spaCy projects require DVC (Data Version Control) and the 'dvc' command",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )


def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
    """Check that the project is set up correctly with DVC and update its
    config if needed. Will raise an error if the project is not an initialized
    DVC project.

    project_dir (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    """
    if not project_dir.exists():
        msg.fail(f"Can't find project directory: {project_dir}")
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project.",
            f"Make sure that the project template was cloned correctly. To "
            f"initialize the project directory manually, you can run: "
            f"{COMMAND} project init {project_dir}",
            exits=1,
        )
    with msg.loading("Updating DVC config..."):
        updated = update_dvc_config(project_dir, config, silent=True)
    if updated:
        msg.good(f"Updated DVC config from changed {CONFIG_FILE}")


def run_commands(
    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    silent (bool): Don't print the commands.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if len(command) and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif len(command) and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {' '.join(command)}")
        run_command(command)
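The per-command preprocessing in run_commands() can be sketched in isolation; shlex.split stands in for spaCy's split_command helper here:

import shlex
import sys

def prepare_command(command: str, variables: dict) -> list:
    # Substitute {variables}, split the string, and pin "python"/"pip" to the
    # interpreter spaCy itself is running under.
    parts = shlex.split(command.format(**variables))
    if parts and parts[0] in ("python", "python3"):
        parts[0] = sys.executable
    elif parts and parts[0] in ("pip", "pip3"):
        parts = [sys.executable, "-m", "pip", *parts[1:]]
    return parts

print(prepare_command("python scripts/train.py {config}", {"config": "cfg.json"}))
# ['/usr/bin/python3', 'scripts/train.py', 'cfg.json']  (path depends on the env)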
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if re.match("(http(s?)):\/\/github.com", url):
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url


def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            f"Cloning spaCy project templates requires Git and the 'git' command. ",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run:",
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )


def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    subcommand (str): The subcommand.
    """
    if subcommand not in commands:
        msg.fail(
            f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
            f"Available commands: {', '.join(commands)}",
            exits=1,
        )


def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    progress_settings = {
        "total": total,
        "unit": "iB",
        "unit_scale": True,
        "unit_divisor": chunk_size,
        "leave": False,
    }
    with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
        for data in response.iter_content(chunk_size=chunk_size):
            size = f.write(data)
            bar.update(size)
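A quick check of the URL rewrite convert_asset_url() performs (the repo path is a made-up example):

import re

def to_raw_url(url: str) -> str:
    if re.match(r"https?://github\.com", url):
        url = url.replace("github.com", "raw.githubusercontent.com")
        url = re.sub(r"/(tree|blob)/", "/", url)
    return url

assert (
    to_raw_url("https://github.com/user/repo/blob/master/data.json")
    == "https://raw.githubusercontent.com/user/repo/master/data.json"
)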
new file: spacy/cli/project/__init__.py (0 lines)
new file: spacy/cli/project/assets.py (158 lines)
@@ -0,0 +1,158 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import requests
import tqdm
import re
import shutil

from ...util import ensure_path, working_dir
from .._app import project_cli, Arg
from .util import PROJECT_FILE, load_project_config, get_checksum


# TODO: find a solution for caches
# CACHES = [
#     Path.home() / ".torch",
#     Path.home() / ".caches" / "torch",
#     os.environ.get("TORCH_HOME"),
#     Path.home() / ".keras",
# ]


@project_cli.command("assets")
def project_assets_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.
    """
    project_assets(project_dir)


def project_assets(project_dir: Path) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    variables = config.get("variables", {})
    for asset in assets:
        dest = asset["dest"].format(**variables)
        url = asset.get("url")
        checksum = asset.get("checksum")
        if not url:
            # project.yml defines asset without URL that the user has to place
            check_private_asset(dest, checksum)
            continue
        url = url.format(**variables)
        fetch_asset(project_path, url, dest, checksum)


def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Check and validate assets without a URL (private assets that the user
    has to provide themselves) and give feedback about the checksum.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
        msg.warn(err)
    else:
        if checksum and checksum == get_checksum(dest):
            msg.good(f"Asset exists with matching checksum: {dest}")
        else:
            msg.fail(f"Asset available but with incorrect checksum: {dest}")


def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Fetch an asset from a given URL or path. If a checksum is provided and a
    local file exists, it's only re-downloaded if the checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
        the asset failed.
    """
    # TODO: add support for caches
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return dest_path
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml
    if not dest_path.parent.exists():
        dest_path.parent.mkdir(parents=True)
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            if Path(url).exists() and Path(url).is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
                return
    if checksum and checksum != get_checksum(dest_path):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")


def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if re.match(r"(http(s?)):\/\/github.com", url):
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url


def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    progress_settings = {
        "total": total,
        "unit": "iB",
        "unit_scale": True,
        "unit_divisor": chunk_size,
        "leave": False,
    }
    with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
        for data in response.iter_content(chunk_size=chunk_size):
            size = f.write(data)
            bar.update(size)
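The checksum comparison driving the skip-if-unchanged behaviour can be sketched standalone; this is an illustrative stand-in for get_checksum(), which may hash differently in spacy.util:

import hashlib
from pathlib import Path

def file_checksum(path: Path) -> str:
    # Hash file contents in chunks so fetch_asset() can skip downloads when
    # the local copy already matches the checksum in project.yml.
    digest = hashlib.md5()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()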
new file: spacy/cli/project/clone.py (97 lines)
@@ -0,0 +1,97 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
import shutil
import re

from ... import about
from ...util import ensure_path, run_command, make_tempdir
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE


@project_cli.command("clone")
def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
    (including using a private repo).
    """
    if dest is None:
        dest = Path.cwd() / name
    project_clone(name, dest, repo=repo)


def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
        try:
            run_command(cmd)
        except subprocess.CalledProcessError:
            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
            msg.fail(err)
        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
            f.write(name)
        try:
            run_command(["git", "-C", str(tmp_dir), "fetch"])
            run_command(["git", "-C", str(tmp_dir), "checkout"])
        except subprocess.CalledProcessError:
            err = f"Could not clone '{name}' from repo '{repo_name}'"
            msg.fail(err)
        # We need Path(name) to make sure we also support subdirectories
        shutil.move(str(tmp_dir / Path(name)), str(project_dir))
    msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
    if not (project_dir / PROJECT_FILE).exists():
        msg.warn(f"No {PROJECT_FILE} found in directory")
    else:
        msg.good(f"Your project is now ready!")
        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")


def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            f"Cloning spaCy project templates requires Git and the 'git' command. ",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run:",
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )
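The clone above relies on Git's sparse checkout so only one template subdirectory is materialized. A self-contained re-creation using just the standard library (the repo URL and subdirectory name would be placeholders):

import shutil
import subprocess
import tempfile
from pathlib import Path

def sparse_clone(repo: str, subdir: str, dest: Path) -> None:
    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        subprocess.run(
            ["git", "clone", repo, str(tmp_dir), "--no-checkout", "--depth", "1",
             "--config", "core.sparseCheckout=true"],
            check=True,
        )
        # Whitelist the one subdirectory we want, then materialize it
        (tmp_dir / ".git" / "info" / "sparse-checkout").write_text(subdir)
        subprocess.run(["git", "-C", str(tmp_dir), "fetch"], check=True)
        subprocess.run(["git", "-C", str(tmp_dir), "checkout"], check=True)
        shutil.move(str(tmp_dir / subdir), str(dest))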
new file: spacy/cli/project/dvc.py (208 lines)
@ -0,0 +1,208 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional
import subprocess
from pathlib import Path
from wasabi import msg

from .util import PROJECT_FILE, load_project_config, get_hash
from .._app import project_cli, Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command


DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""


@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml changed.
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)


def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    updated = update_dvc_config(
        project_dir, config, workflow, verbose=verbose, force=force
    )
    help_msg = "To execute the workflow with DVC, run: dvc repro"
    if updated:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
    else:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)


def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    if not workflow:
        workflow = workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project.yml, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    dvc_commands = []
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "run", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        dvc_commands.append(join_command(full_cmd))
    with working_dir(path):
        dvc_flags = {"--verbose": verbose, "--quiet": silent}
        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True


def run_dvc_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, str] = {},
    flags: Dict[str, bool] = {},
) -> None:
    """Run a sequence of DVC commands in a subprocess, in order.

    commands (List[str]): The string commands without the leading "dvc".
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
        easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        dvc_command = ["dvc", *command]
        # Add the flags if they are set to True
        for flag, is_active in flags.items():
            if is_active:
                dvc_command.append(flag)
        run_command(dvc_command)


def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    if workflow is not None and workflow not in workflows:
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}",
            exits=1,
        )
    if not workflow:
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )


def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project.
    """
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
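For a hypothetical project.yml command named "train" with one dependency and one output, the flattening in update_dvc_config assembles a DVC invocation along these lines (a sketch restating the list comprehensions above, not captured output):

    command = {"name": "train", "deps": ["corpus/train.json"], "outputs": ["training/model-best"]}
    deps_cmd = [c for cl in [["-d", p] for p in command["deps"]] for c in cl]
    outputs_cmd = [c for cl in [["-o", p] for p in command["outputs"]] for c in cl]
    full_cmd = ["run", "-n", command["name"], "-w", "/path/to/project", "--no-exec",
                *deps_cmd, *outputs_cmd,
                "python", "-m", "spacy", "project", "run", command["name"]]
    print(" ".join(["dvc", *full_cmd]))
    # dvc run -n train -w /path/to/project --no-exec -d corpus/train.json
    #     -o training/model-best python -m spacy project run train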
spacy/cli/project/run.py (new file, 266 lines)

@@ -0,0 +1,266 @@
from typing import Optional, List, Dict, Sequence, Any
from pathlib import Path
from wasabi import msg
import sys
import srsly

from ...util import working_dir, run_command, split_command, is_cwd, join_command
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .util import get_checksum


@project_cli.command("run")
def project_run_cli(
    # fmt: off
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.
    """
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        project_run(project_dir, subcommand, force=force, dry=dry)


def project_run(
    project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    """
    config = load_project_config(project_dir)
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(project_dir, cmd, force=force, dry=dry)
    else:
        cmd = commands[subcommand]
        variables = config.get("variables", {})
        for dep in cmd.get("deps", []):
            dep = dep.format(**variables)
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, **err_kwargs)
        with working_dir(project_dir) as current_dir:
            rerun = check_rerun(current_dir, cmd, variables)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                msg.divider(subcommand)
                run_commands(cmd["script"], variables, dry=dry)
                if not dry:
                    update_lockfile(current_dir, cmd, variables)


def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    project_loc = "" if is_cwd(project_dir) else project_dir
    if subcommand:
        validate_subcommand(commands.keys(), workflows.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
        if subcommand in commands:
            help_text = commands[subcommand].get("help")
            if help_text:
                print(f"\n{help_text}\n")
        elif subcommand in workflows:
            steps = workflows[subcommand]
            print(f"\nWorkflow consisting of {len(steps)} commands:")
            steps_data = [
                (f"{i + 1}. {step}", commands[step].get("help", ""))
                for i, step in enumerate(steps)
            ]
            msg.table(steps_data)
            help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
            print(f"For command details, run: {help_cmd}")
    else:
        print("")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])


def run_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, Any] = {},
    silent: bool = False,
    dry: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if len(command) and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif len(command) and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if not dry:
            run_command(command)


def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    subcommand (str): The subcommand.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand not in commands and subcommand not in workflows:
        help_msg = []
        if commands:
            help_msg.append(f"Available commands: {', '.join(commands)}")
        if workflows:
            help_msg.append(f"Available workflows: {', '.join(workflows)}")
        msg.fail(
            f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
            ". ".join(help_msg),
            exits=1,
        )


def check_rerun(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (bool): Whether to re-run the command.
    """
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():  # We don't have a lockfile, run command
        return True
    data = srsly.read_yaml(lock_path)
    if command["name"] not in data:  # We don't have info about this command
        return True
    entry = data[command["name"]]
    # Always run commands with no outputs (otherwise they'd always be skipped)
    if not entry.get("outs", []):
        return True
    # If the entry in the lockfile matches the lockfile entry that would be
    # generated from the current command, we don't rerun because it means that
    # all inputs/outputs, hashes and scripts are the same and nothing changed
    return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)


def update_lockfile(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():
        srsly.write_yaml(lock_path, {})
        data = {}
    else:
        data = srsly.read_yaml(lock_path)
    data[command["name"]] = get_lock_entry(project_dir, command, variables)
    srsly.write_yaml(lock_path, data)


def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    deps = get_fileinfo(project_dir, command.get("deps", []), variables)
    outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
    return {
        "cmd": f"{COMMAND} run {command['name']}",
        "script": command["script"],
        "deps": deps,
        "outs": [*outs, *outs_nc],
    }


def get_fileinfo(
    project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
    """
    data = []
    for path in paths:
        path = path.format(**variables)
        file_path = project_dir / path
        md5 = get_checksum(file_path) if file_path.exists() else None
        data.append({"path": path, "md5": md5})
    return data
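The skip logic in check_rerun reduces to hash equality between two lockfile entries. A self-contained sketch of the idea, using json in place of srsly but the same sort_keys canonicalization (the entry values are hypothetical):

    import hashlib
    import json

    def entry_hash(entry: dict) -> str:
        # Canonical JSON so key order can't change the digest
        return hashlib.md5(json.dumps(entry, sort_keys=True).encode("utf8")).hexdigest()

    old = {"cmd": "spacy run train", "deps": [{"path": "x.json", "md5": "abc"}],
           "outs": [{"path": "model", "md5": "def"}]}
    new = {"cmd": "spacy run train", "deps": [{"path": "x.json", "md5": "abc"}],
           "outs": [{"path": "model", "md5": "def"}]}
    rerun = entry_hash(new) != entry_hash(old)  # False here, so the command would be skipped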
spacy/cli/project/util.py (new file, 93 lines)

@@ -0,0 +1,93 @@
from typing import Dict, Any, Union
from pathlib import Path
from wasabi import msg
import srsly
import hashlib

from ...schemas import ProjectConfigSchema, validate


PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"


def load_project_config(path: Path) -> Dict[str, Any]:
    """Load the project.yml file from a directory and validate it. Also make
    sure that all directories defined in the config exist.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err, "\n".join(errors), exits=1)
    validate_project_commands(config)
    # Make sure directories defined in config exist
    for subdir in config.get("directories", []):
        dir_path = path / subdir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    return config


def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
    if duplicates:
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )


def get_hash(data) -> str:
    """Get the hash for a JSON-serializable object.

    data: The data to hash.
    RETURNS (str): The hash.
    """
    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
    return hashlib.md5(data_str).hexdigest()


def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file or directory given its file path. If a
    directory path is provided, this uses all files in that directory.

    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum.
    """
    path = Path(path)
    if path.is_file():
        return hashlib.md5(Path(path).read_bytes()).hexdigest()
    if path.is_dir():
        # TODO: this is currently pretty slow
        dir_checksum = hashlib.md5()
        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
            dir_checksum.update(sub_file.read_bytes())
        return dir_checksum.hexdigest()
    raise ValueError(f"Can't get checksum for {path}: not a file or directory")
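Both hashing helpers are deterministic by construction: get_hash serializes with sort_keys=True and get_checksum walks directories in sorted order. A quick check of that property, assuming the two functions above are in scope (temporary files only):

    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as a, tempfile.TemporaryDirectory() as b:
        # Same files, created in a different order
        Path(a, "x.txt").write_bytes(b"one"); Path(a, "y.txt").write_bytes(b"two")
        Path(b, "y.txt").write_bytes(b"two"); Path(b, "x.txt").write_bytes(b"one")
        assert get_checksum(a) == get_checksum(b)   # sorted traversal ignores creation order
        assert get_hash({"x": 1, "y": 2}) == get_hash({"y": 2, "x": 1})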
@@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
@app.command("train")
def train_cli(
    # fmt: off
    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
    train_path: Path = Arg(..., help="Location of training data", exists=True),
    dev_path: Path = Arg(..., help="Location of development data", exists=True),
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
    raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
    verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
    num_workers: int = Opt(None, "-j", help="Parallel Workers"),
    strategy: str = Opt("allreduce", "--strategy", help="Distributed training strategy (requires spacy_ray)"),

@@ -155,6 +155,7 @@ def train_cli(
    if init_tok2vec is not None:
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()

    train_args = dict(
        config_path=config_path,
        data_paths={"train": train_path, "dev": dev_path},

@@ -170,7 +171,7 @@ def train_cli(
        distributed_setup_and_train(use_gpu, num_workers, strategy, ray_address, train_args)
    else:
        if use_gpu >= 0:
            msg.info(f"Using GPU: {str(use_gpu)}")
            msg.info(f"Using GPU: {use_gpu}")
            require_gpu(use_gpu)
        else:
            msg.info("Using CPU")

@@ -191,7 +192,8 @@ def train(
    msg.info(f"Loading config from: {config_path}")
    # Read the config first without creating objects, to get to the original nlp_config
    config = util.load_config(config_path, create_objects=False)
    fix_random_seed(config["training"]["seed"])
    if config["training"].get("seed"):
        fix_random_seed(config["training"]["seed"])
    if config["training"].get("use_pytorch_for_gpu_memory"):
        # It feels kind of weird to not have a default for this.
        use_pytorch_for_gpu_memory()

@@ -216,7 +218,10 @@ def train(
    msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
    train_examples = list(
        corpus.train_dataset(
            nlp, shuffle=False, gold_preproc=training["gold_preproc"]
            nlp,
            shuffle=False,
            gold_preproc=training["gold_preproc"],
            max_length=training["max_length"],
        )
    )
    nlp.begin_training(lambda: train_examples)

@@ -315,6 +320,7 @@ def create_train_batches(nlp, corpus, cfg, randomization_index):
    )

    epoch = 0
    batch_strategy = cfg.get("batch_by", "sequences")
    while True:
        if len(train_examples) == 0:
            raise ValueError(Errors.E988)

@@ -324,11 +330,22 @@ def create_train_batches(nlp, corpus, cfg, randomization_index):
            random.random()
            random.shuffle(train_examples)
            epoch += 1
        batches = util.minibatch_by_words(
            train_examples,
            size=cfg["batch_size"],
            discard_oversize=cfg["discard_oversize"],
        )
        if batch_strategy == "padded":
            batches = util.minibatch_by_padded_size(
                train_examples,
                size=cfg["batch_size"],
                buffer=256,
                discard_oversize=cfg["discard_oversize"],
            )
        elif batch_strategy == "words":
            batches = util.minibatch_by_words(
                train_examples,
                size=cfg["batch_size"],
                discard_oversize=cfg["discard_oversize"],
            )
        else:
            batches = util.minibatch(train_examples, size=cfg["batch_size"])

        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
        try:
            first = next(batches)

@@ -440,7 +457,9 @@ def train_while_improving(

    if raw_text:
        random.shuffle(raw_text)
        raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text]
        raw_examples = [
            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
        ]
        raw_batches = util.minibatch(raw_examples, size=8)

    for step, (epoch, batch) in enumerate(train_data):
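The new batch_by setting selects between three batching strategies: "padded", "words" and the sequence-count fallback. A rough standalone sketch of what "words" batching does, independent of spaCy's actual util.minibatch_by_words implementation:

    def minibatch_by_words_sketch(examples, size, discard_oversize=False):
        # Group examples so each batch holds roughly `size` tokens in total
        batch, n_words = [], 0
        for eg in examples:
            n = len(eg)
            if n > size and discard_oversize:
                continue  # drop examples that can never fit
            if batch and n_words + n > size:
                yield batch
                batch, n_words = [], 0
            batch.append(eg)
            n_words += n
        if batch:
            yield batch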
@@ -69,6 +69,9 @@ class Warnings(object):
    W027 = ("Found a large training file of {size} bytes. Note that it may "
            "be more efficient to split your training data into multiple "
            "smaller JSON files instead.")
    W028 = ("Doc.from_array was called with a vector of type '{type}', "
            "but is expecting one of type 'uint64' instead. This may result "
            "in problems with the vocab further on in the pipeline.")
    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
            "entities \"{entities}\". Use "
            "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"

@@ -477,15 +480,14 @@ class Errors(object):
    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

    # TODO: fix numbering after merging develop into master
    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
    E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
            "array and {doc_length} for the Doc itself.")
    E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
    E973 = ("Unexpected type for NER data")
    E974 = ("Unknown {obj} attribute: {key}")
    E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
            "but got {type}")
    E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
            "but received None.")
    E977 = ("Can not compare a MorphAnalysis with a string object. "
            "This is likely a bug in spaCy, so feel free to open an issue.")
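The reworked E976 template carries the argument position and expected type, so one message now covers both None checks in Example.from_dict:

    Errors.E976.format(n="first", type="Doc")
    # -> "The method 'Example.from_dict' expects a Doc as first argument, but received None."
    Errors.E976.format(n="second", type="dict")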
@@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):

cdef class Example:
    def __init__(self, Doc predicted, Doc reference, *, alignment=None):
        """ Doc can either be text, or an actual Doc """
        if predicted is None:
            raise TypeError(Errors.E972.format(arg="predicted"))
        if reference is None:

@@ -37,6 +36,9 @@ cdef class Example:
        self.y = reference
        self._alignment = alignment

    def __len__(self):
        return len(self.predicted)

    property predicted:
        def __get__(self):
            return self.x

@@ -59,17 +61,15 @@ cdef class Example:

    @classmethod
    def from_dict(cls, Doc predicted, dict example_dict):
        if predicted is None:
            raise ValueError(Errors.E976.format(n="first", type="Doc"))
        if example_dict is None:
            raise ValueError(Errors.E976)
        if not isinstance(predicted, Doc):
            raise TypeError(Errors.E975.format(type=type(predicted)))
            raise ValueError(Errors.E976.format(n="second", type="dict"))
        example_dict = _fix_legacy_dict_data(example_dict)
        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
        if "ORTH" not in tok_dict:
            tok_dict["ORTH"] = [tok.text for tok in predicted]
            tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
        if not _has_field(tok_dict, "SPACY"):
            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
        return Example(
            predicted,
            annotations2doc(predicted.vocab, tok_dict, doc_dict)

@@ -257,7 +257,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
            values.append([vocab.morphology.add(v) for v in value])
        else:
            attrs.append(key)
            values.append([vocab.strings.add(v) for v in value])
            try:
                values.append([vocab.strings.add(v) for v in value])
            except TypeError:
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types))

    array = numpy.asarray(values, dtype="uint64")
    return attrs, array.T

@@ -325,8 +329,8 @@ def _fix_legacy_dict_data(example_dict):
        for key, value in old_token_dict.items():
            if key in ("text", "ids", "brackets"):
                pass
            elif key in remapping:
                token_dict[remapping[key]] = value
            elif key.lower() in remapping:
                token_dict[remapping[key.lower()]] = value
            else:
                raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
    text = example_dict.get("text", example_dict.get("raw"))
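Typical use of the stricter constructor, as a sketch; the spacy.gold import path matches the v3 nightlies this diff belongs to, and the entity offsets are illustrative:

    import spacy
    from spacy.gold import Example

    nlp = spacy.blank("en")
    doc = nlp.make_doc("Apple is looking at buying U.K. startup")
    eg = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})
    # Example.from_dict(None, {...}) now raises ValueError with E976 (n="first", type="Doc")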
@@ -513,20 +513,23 @@ class Language(object):
    ):
        """Update the models in the pipeline.

        examples (iterable): A batch of `Example` objects.
        examples (Iterable[Example]): A batch of examples
        dummy: Should not be set - serves to catch backwards-incompatible scripts.
        drop (float): The dropout rate.
        sgd (callable): An optimizer.
        losses (dict): Dictionary to update with the loss, keyed by component.
        component_cfg (dict): Config parameters for specific pipeline
        sgd (Optimizer): An optimizer.
        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
            components, keyed by component name.
        RETURNS (Dict[str, float]): The updated losses dictionary

        DOCS: https://spacy.io/api/language#update
        """
        if dummy is not None:
            raise ValueError(Errors.E989)
        if losses is None:
            losses = {}
        if len(examples) == 0:
            return
            return losses
        if not isinstance(examples, Iterable):
            raise TypeError(Errors.E978.format(name="language", method="update", types=type(examples)))
        wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])

@@ -540,22 +543,19 @@ class Language(object):

        if component_cfg is None:
            component_cfg = {}
        component_deps = count_pipeline_interdependencies(self.pipeline)
        # Determine whether component should set annotations. In theory I guess
        # we should do this by inspecting the meta? Or we could just always
        # say "yes"
        for i, (name, proc) in enumerate(self.pipeline):
            component_cfg.setdefault(name, {})
            component_cfg[name].setdefault("drop", drop)
            component_cfg[name]["set_annotations"] = bool(component_deps[i])
            component_cfg[name].setdefault("set_annotations", False)
        for name, proc in self.pipeline:
            if not hasattr(proc, "update"):
                continue
            proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
        if sgd is not False:
        if sgd not in (None, False):
            for name, proc in self.pipeline:
                if hasattr(proc, "model"):
                    proc.model.finish_update(sgd)
        return losses

    def rehearse(self, examples, sgd=None, losses=None, config=None):
        """Make a "rehearsal" update to the models in the pipeline, to prevent

@@ -761,18 +761,17 @@ class Language(object):
    ):
        """Process texts as a stream, and yield `Doc` objects in order.

        texts (iterator): A sequence of texts to process.
        texts (Iterable[str]): A sequence of texts to process.
        as_tuples (bool): If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        batch_size (int): The number of texts to buffer.
        disable (list): Names of the pipeline components to disable.
        disable (List[str]): Names of the pipeline components to disable.
        cleanup (bool): If True, unneeded strings are freed to control memory
            use. Experimental.
        component_cfg (dict): An optional dictionary with extra keyword
        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
            arguments for specific components.
        n_process (int): Number of processors to process texts, only supported
            in Python3. If -1, set `multiprocessing.cpu_count()`.
        n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
        YIELDS (Doc): Documents in the order of the original text.

        DOCS: https://spacy.io/api/language#pipe
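The practical effect of the update refactor: losses is always a dict on the way out, even for an empty batch. A sketch, assuming nlp and a list of Example objects exist from earlier setup:

    losses = nlp.update([])        # returns {} instead of plain None
    losses = nlp.update(examples)  # e.g. {"tagger": 0.12, ...}, keyed by component
    assert isinstance(losses, dict)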
@@ -1,13 +1,14 @@
from thinc.api import Model, normal_init


def PrecomputableAffine(nO, nI, nF, nP):
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
    model = Model(
        "precomputable_affine",
        forward,
        init=init,
        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
        params={"W": None, "b": None, "pad": None},
        attrs={"dropout_rate": dropout}
    )
    return model

@@ -48,17 +49,14 @@ def forward(model, X, is_train):
    model.inc_grad("b", dY.sum(axis=0))
    dY = dY.reshape((dY.shape[0], nO * nP))

    Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
    Wopfi = W.transpose((1, 2, 0, 3))
    Wopfi = Wopfi.reshape((nO * nP, nF * nI))
    dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)

    # Reuse the buffer
    dWopfi = Wopfi
    dWopfi.fill(0.0)
    model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
    dWopfi = model.ops.gemm(dY, Xf, trans1=True)
    dWopfi = dWopfi.reshape((nO, nP, nF, nI))
    # (o, p, f, i) --> (f, o, p, i)
    dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
    dWopfi = dWopfi.transpose((2, 0, 1, 3))
    model.inc_grad("W", dWopfi)
    return dXf.reshape((dXf.shape[0], nF, nI))
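The refactored backward pass is two reshaped matrix multiplies. A numpy sketch of the shapes involved, with arbitrary small dimensions; the (f, o, p, i) layout of W is inferred from the transposes above, not stated in this diff:

    import numpy as np

    n, nF, nO, nP, nI = 7, 3, 4, 2, 5                   # arbitrary small dims
    W = np.random.rand(nF, nO, nP, nI).astype("f")      # assumed parameter layout (f, o, p, i)
    dY = np.random.rand(n, nO * nP).astype("f")
    Xf = np.random.rand(n, nF * nI).astype("f")

    Wopfi = W.transpose((1, 2, 0, 3)).reshape((nO * nP, nF * nI))
    dXf = dY @ Wopfi                                    # (n, nF * nI): gradient w.r.t. features
    dWopfi = (dY.T @ Xf).reshape((nO, nP, nF, nI))      # same as gemm(dY, Xf, trans1=True)
    dW = dWopfi.transpose((2, 0, 1, 3))                 # back to (f, o, p, i), matching W
    assert dW.shape == W.shape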
@@ -87,16 +87,16 @@ def build_text_classifier(
    cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        lower = HashEmbed(
            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
        )
        prefix = HashEmbed(
            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
        )
        suffix = HashEmbed(
            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
        )
        shape = HashEmbed(
            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
        )

        width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])

@@ -154,16 +154,16 @@ def LayerNormalizedMaxout(width, maxout_pieces):
def MultiHashEmbed(
    columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
):
    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6)
    if use_subwords:
        prefix = HashEmbed(
            nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
            nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout, seed=7
        )
        suffix = HashEmbed(
            nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
            nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout, seed=8
        )
        shape = HashEmbed(
            nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
            nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout, seed=9
        )

    if pretrained_vectors:

@@ -192,7 +192,7 @@ def MultiHashEmbed(

@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5)
    chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
    with Model.define_operators({">>": chain, "|": concatenate}):
        embed_layer = chr_embed | features >> with_array(norm)

@@ -263,20 +263,20 @@ def build_Tok2Vec_model(
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(
            nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout,
            nO=width, nV=embed_size, column=cols.index(NORM), dropout=None,
            seed=0
        )
        if subword_features:
            prefix = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout,
                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None,
                seed=1
            )
            suffix = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout,
                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None,
                seed=2
            )
            shape = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout,
                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None,
                seed=3
            )
        else:

@@ -296,7 +296,7 @@ def build_Tok2Vec_model(
            >> Maxout(
                nO=width,
                nI=width * columns,
                nP=maxout_pieces,
                nP=3,
                dropout=0.0,
                normalize=True,
            ),

@@ -309,7 +309,7 @@ def build_Tok2Vec_model(
            >> Maxout(
                nO=width,
                nI=width * columns,
                nP=maxout_pieces,
                nP=3,
                dropout=0.0,
                normalize=True,
            ),

@@ -322,7 +322,7 @@ def build_Tok2Vec_model(
            >> Maxout(
                nO=width,
                nI=width * columns,
                nP=maxout_pieces,
                nP=3,
                dropout=0.0,
                normalize=True,
            ),

@@ -335,7 +335,7 @@ def build_Tok2Vec_model(
        reduce_dimensions = Maxout(
            nO=width,
            nI=nM * nC + width,
            nP=maxout_pieces,
            nP=3,
            dropout=0.0,
            normalize=True,
        )
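The seed arguments threaded through all the HashEmbed layers make builds reproducible and keep sibling embedding tables from hashing tokens identically. A sketch against the thinc v8 API as used above (the widths and column indices are arbitrary):

    from thinc.api import HashEmbed

    # Distinct seeds, as in build_text_classifier (10-13), give each table its own hashing
    lower = HashEmbed(nO=64, nV=2000, column=1, dropout=0.1, seed=10)
    prefix = HashEmbed(nO=32, nV=2000, column=2, dropout=0.1, seed=11)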
@@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear
from ..syntax._parser_model import ParserStepModel


def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
    """Set up a stepwise transition-based model"""
    if upper is None:
        has_upper = False
@@ -272,7 +272,7 @@ cdef class Morphology:

    @staticmethod
    def feats_to_dict(feats):
        if not feats:
        if not feats or feats == Morphology.EMPTY_MORPH:
            return {}
        return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
                [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
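In UD FEATS terms, feats_to_dict splits on the "|", "=" and "," separators and sorts multi-values, e.g.:

    Morphology.feats_to_dict("Case=Nom|Number=Sing")  # {"Case": "Nom", "Number": "Sing"}
    Morphology.feats_to_dict("")                      # falsy input -> {}
    # With the fix above, the EMPTY_MORPH sentinel now also maps to {}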
@@ -3,7 +3,7 @@ cimport numpy as np

import numpy
import srsly
from thinc.api import to_categorical
from thinc.api import SequenceCategoricalCrossentropy

from ..tokens.doc cimport Doc
from ..vocab cimport Vocab

@@ -85,13 +85,10 @@ class Morphologizer(Tagger):
        doc.is_morphed = True

    def get_loss(self, examples, scores):
        scores = self.model.ops.flatten(scores)
        tag_index = {tag: i for i, tag in enumerate(self.labels)}
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = []
        for eg in examples:
            eg_truths = []
            pos_tags = eg.get_aligned("POS", as_string=True)
            morphs = eg.get_aligned("MORPH", as_string=True)
            for i in range(len(morphs)):

@@ -104,20 +101,11 @@ class Morphologizer(Tagger):
                morph = self.vocab.strings[self.vocab.morphology.add(feats)]
                if morph == "":
                    morph = Morphology.EMPTY_MORPH
            if morph is None:
                correct[idx] = guesses[idx]
            elif morph in tag_index:
                correct[idx] = tag_index[morph]
            else:
                correct[idx] = 0
                known_labels[idx] = 0.
            idx += 1
        correct = self.model.ops.xp.array(correct, dtype="i")
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
        d_scores *= self.model.ops.asarray(known_labels)
        loss = (d_scores**2).sum()
        docs = [eg.predicted for eg in examples]
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
                eg_truths.append(morph)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError("nan value when computing loss")
        return float(loss), d_scores

    def to_bytes(self, exclude=tuple()):
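Both taggers now delegate to thinc's SequenceCategoricalCrossentropy instead of building to_categorical targets by hand. The calling pattern, as a sketch assuming per-doc score arrays and a list of string labels:

    from thinc.api import SequenceCategoricalCrossentropy

    loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
    d_scores, loss = loss_func(scores, truths)  # truths: List[List[Optional[str]]]; None entries are masked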
@ -58,12 +58,8 @@ class Pipe(object):
|
|||
Both __call__ and pipe should delegate to the `predict()`
|
||||
and `set_annotations()` methods.
|
||||
"""
|
||||
predictions = self.predict([doc])
|
||||
if isinstance(predictions, tuple) and len(predictions) == 2:
|
||||
scores, tensors = predictions
|
||||
self.set_annotations([doc], scores, tensors=tensors)
|
||||
else:
|
||||
self.set_annotations([doc], predictions)
|
||||
scores = self.predict([doc])
|
||||
self.set_annotations([doc], scores)
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128):
|
||||
|
@ -73,12 +69,8 @@ class Pipe(object):
|
|||
and `set_annotations()` methods.
|
||||
"""
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
predictions = self.predict(docs)
|
||||
if isinstance(predictions, tuple) and len(tuple) == 2:
|
||||
scores, tensors = predictions
|
||||
self.set_annotations(docs, scores, tensors=tensors)
|
||||
else:
|
||||
self.set_annotations(docs, predictions)
|
||||
scores = self.predict(docs)
|
||||
self.set_annotations(docs, scores)
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
|
@ -87,7 +79,7 @@ class Pipe(object):
|
|||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def set_annotations(self, docs, scores, tensors=None):
|
||||
def set_annotations(self, docs, scores):
|
||||
"""Modify a batch of documents, using pre-computed scores."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
@ -281,9 +273,10 @@ class Tagger(Pipe):
|
|||
idx += 1
|
||||
doc.is_tagged = True
|
||||
|
||||
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
||||
try:
|
||||
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||
|
@ -303,11 +296,11 @@ class Tagger(Pipe):
|
|||
if sgd not in (None, False):
|
||||
self.model.finish_update(sgd)
|
||||
|
||||
if losses is not None:
|
||||
losses[self.name] += loss
|
||||
losses[self.name] += loss
|
||||
if set_annotations:
|
||||
docs = [eg.predicted for eg in examples]
|
||||
self.set_annotations(docs, self._scores2guesses(tag_scores))
|
||||
return losses
|
||||
|
||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||
"""Perform a 'rehearsal' update, where we try to match the output of
|
||||
|
@ -334,7 +327,7 @@ class Tagger(Pipe):
|
|||
losses[self.name] += (gradient**2).sum()
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels)
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
||||
d_scores, loss = loss_func(scores, truths)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
|
@ -521,29 +514,23 @@ class SentenceRecognizer(Tagger):
|
|||
doc.c[j].sent_start = -1
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
scores = self.model.ops.flatten(scores)
|
||||
tag_index = range(len(self.labels))
|
||||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
||||
guesses = scores.argmax(axis=1)
|
||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
||||
labels = self.labels
|
||||
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
||||
truths = []
|
||||
for eg in examples:
|
||||
sent_starts = eg.get_aligned("sent_start")
|
||||
for sent_start in sent_starts:
|
||||
if sent_start is None:
|
||||
correct[idx] = guesses[idx]
|
||||
elif sent_start in tag_index:
|
||||
correct[idx] = sent_start
|
||||
eg_truth = []
|
||||
for x in eg.get_aligned("sent_start"):
|
||||
if x == None:
|
||||
eg_truth.append(None)
|
||||
elif x == 1:
|
||||
eg_truth.append(labels[1])
|
||||
else:
|
||||
correct[idx] = 0
|
||||
known_labels[idx] = 0.
|
||||
idx += 1
|
||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
||||
d_scores *= self.model.ops.asarray(known_labels)
|
||||
loss = (d_scores**2).sum()
|
||||
docs = [eg.predicted for eg in examples]
|
||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||
# anything other than 1: 0, -1, -1 as uint64
|
||||
eg_truth.append(labels[0])
|
||||
truths.append(eg_truth)
|
||||
d_scores, loss = loss_func(scores, truths)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError("nan value when computing loss")
|
||||
return float(loss), d_scores
|
||||
|
||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
||||
|
@ -641,7 +628,7 @@ class MultitaskObjective(Tagger):
|
|||
def labels(self, value):
|
||||
self.cfg["labels"] = value
|
||||
|
||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
||||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
||||
|
@ -738,7 +725,7 @@ class ClozeMultitask(Pipe):
|
|||
self.cfg = cfg
|
||||
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
|
||||
|
||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
||||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
||||
|
@ -767,7 +754,7 @@ class ClozeMultitask(Pipe):
|
|||
loss = self.distance.get_loss(prediction, target)
|
||||
return loss, gradient
|
||||
|
||||
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||
pass
|
||||
|
||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||
|
@ -815,8 +802,8 @@ class TextCategorizer(Pipe):
|
|||
|
||||
def pipe(self, stream, batch_size=128):
|
||||
for docs in util.minibatch(stream, size=batch_size):
|
||||
scores, tensors = self.predict(docs)
|
||||
self.set_annotations(docs, scores, tensors=tensors)
|
||||
scores = self.predict(docs)
|
||||
self.set_annotations(docs, scores)
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
|
@ -826,22 +813,25 @@ class TextCategorizer(Pipe):
|
|||
# Handle cases where there are no tokens in any docs.
|
||||
xp = get_array_module(tensors)
|
||||
scores = xp.zeros((len(docs), len(self.labels)))
|
||||
return scores, tensors
|
||||
return scores
|
||||
|
||||
scores = self.model.predict(docs)
|
||||
scores = self.model.ops.asarray(scores)
|
||||
return scores, tensors
|
||||
return scores
|
||||
|
||||
def set_annotations(self, docs, scores, tensors=None):
|
||||
def set_annotations(self, docs, scores):
|
||||
for i, doc in enumerate(docs):
|
||||
for j, label in enumerate(self.labels):
|
||||
doc.cats[label] = float(scores[i, j])
|
||||
|
||||
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
try:
|
||||
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
return
|
||||
return losses
|
||||
except AttributeError:
|
||||
types = set([type(eg) for eg in examples])
|
||||
raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types))
|
||||
|
@ -853,12 +843,11 @@ class TextCategorizer(Pipe):
|
|||
bp_scores(d_scores)
|
||||
if sgd is not None:
|
||||
self.model.finish_update(sgd)
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.0)
|
||||
losses[self.name] += loss
|
||||
losses[self.name] += loss
|
||||
if set_annotations:
|
||||
docs = [eg.predicted for eg in examples]
|
||||
self.set_annotations(docs, scores=scores)
|
||||
return losses
|
||||
|
||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||
if self._rehearsal_model is None:
|
||||
|
@ -1082,12 +1071,13 @@ class EntityLinker(Pipe):
|
|||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
|
||||
def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
|
||||
self.require_kb()
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.0)
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
if not examples:
|
||||
return 0
|
||||
return losses
|
||||
sentence_docs = []
|
||||
try:
|
||||
docs = [eg.predicted for eg in examples]
|
||||
|
@ -1130,20 +1120,19 @@ class EntityLinker(Pipe):
|
|||
return 0.0
|
||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
|
||||
loss, d_scores = self.get_similarity_loss(
|
||||
scores=sentence_encodings,
|
||||
sentence_encodings=sentence_encodings,
|
||||
examples=examples
|
||||
)
|
||||
bp_context(d_scores)
|
||||
if sgd is not None:
|
||||
self.model.finish_update(sgd)
|
||||
|
||||
if losses is not None:
|
||||
losses[self.name] += loss
|
||||
losses[self.name] += loss
|
||||
if set_annotations:
|
||||
self.set_annotations(docs, predictions)
|
||||
return loss
|
||||
return losses
|
||||
|
||||
def get_similarity_loss(self, examples, scores):
|
||||
def get_similarity_loss(self, examples, sentence_encodings):
|
||||
entity_encodings = []
|
||||
for eg in examples:
|
||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||
|
@@ -1155,41 +1144,23 @@ class EntityLinker(Pipe):

        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")

-        if scores.shape != entity_encodings.shape:
+        if sentence_encodings.shape != entity_encodings.shape:
            raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))

-        gradients = self.distance.get_grad(scores, entity_encodings)
-        loss = self.distance.get_loss(scores, entity_encodings)
+        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
+        loss = self.distance.get_loss(sentence_encodings, entity_encodings)
        loss = loss / len(entity_encodings)
        return loss, gradients

-    def get_loss(self, examples, scores):
-        cats = []
-        for eg in examples:
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    cats.append([1.0])
-
-        cats = self.model.ops.asarray(cats, dtype="float32")
-        if len(scores) != len(cats):
-            raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
-
-        d_scores = (scores - cats)
-        loss = (d_scores ** 2).sum()
-        loss = loss / len(cats)
-        return loss, d_scores
-
    def __call__(self, doc):
-        kb_ids, tensors = self.predict([doc])
-        self.set_annotations([doc], kb_ids, tensors=tensors)
+        kb_ids = self.predict([doc])
+        self.set_annotations([doc], kb_ids)
        return doc

    def pipe(self, stream, batch_size=128):
        for docs in util.minibatch(stream, size=batch_size):
-            kb_ids, tensors = self.predict(docs)
-            self.set_annotations(docs, kb_ids, tensors=tensors)
+            kb_ids = self.predict(docs)
+            self.set_annotations(docs, kb_ids)
            yield from docs

    def predict(self, docs):
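Renaming `scores` to `sentence_encodings` makes the contract explicit: the loss compares the model's sentence encodings against gold entity encodings from the KB, with `self.distance` (a thinc distance loss) supplying matched `get_grad`/`get_loss`. A hedged numpy-only sketch assuming squared-L2 semantics; the real `self.distance` object is configured on the component and may differ (e.g. cosine):

import numpy

# Hedged sketch of a similarity loss shaped like get_similarity_loss above,
# assuming squared-L2 distance; the real loss object is a thinc layer.
def similarity_loss(sentence_encodings, entity_encodings):
    if sentence_encodings.shape != entity_encodings.shape:
        raise ValueError("gold entities do not match up")
    diff = sentence_encodings - entity_encodings   # gradient of 0.5 * ||s - e||^2
    loss = (diff ** 2).sum() / len(entity_encodings)
    return loss, diff

s = numpy.asarray([[1.0, 0.0], [0.0, 1.0]], dtype="float32")
e = numpy.asarray([[1.0, 0.0], [1.0, 1.0]], dtype="float32")
loss, grad = similarity_loss(s, e)
assert loss == 0.5 and grad.shape == s.shape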
@@ -1197,10 +1168,9 @@ class EntityLinker(Pipe):
        self.require_kb()
        entity_count = 0
        final_kb_ids = []
-        final_tensors = []

        if not docs:
-            return final_kb_ids, final_tensors
+            return final_kb_ids

        if isinstance(docs, Doc):
            docs = [docs]
@@ -1234,21 +1204,18 @@ class EntityLinker(Pipe):
                    if to_discard and ent.label_ in to_discard:
                        # ignoring this entity - setting to NIL
                        final_kb_ids.append(self.NIL)
-                        final_tensors.append(sentence_encoding)

                    else:
                        candidates = self.kb.get_candidates(ent.text)
                        if not candidates:
                            # no prediction possible for this entity - setting to NIL
                            final_kb_ids.append(self.NIL)
-                            final_tensors.append(sentence_encoding)

                        elif len(candidates) == 1:
                            # shortcut for efficiency reasons: take the 1 candidate

                            # TODO: thresholding
                            final_kb_ids.append(candidates[0].entity_)
-                            final_tensors.append(sentence_encoding)

                        else:
                            random.shuffle(candidates)
@@ -1277,14 +1244,13 @@ class EntityLinker(Pipe):
                            best_index = scores.argmax().item()
                            best_candidate = candidates[best_index]
                            final_kb_ids.append(best_candidate.entity_)
-                            final_tensors.append(sentence_encoding)

-        if not (len(final_tensors) == len(final_kb_ids) == entity_count):
+        if not (len(final_kb_ids) == entity_count):
            raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))

-        return final_kb_ids, final_tensors
+        return final_kb_ids

-    def set_annotations(self, docs, kb_ids, tensors=None):
+    def set_annotations(self, docs, kb_ids):
        count_ents = len([ent for doc in docs for ent in doc.ents])
        if count_ents != len(kb_ids):
            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
@@ -1400,11 +1366,7 @@ class Sentencizer(Pipe):
    def pipe(self, stream, batch_size=128):
        for docs in util.minibatch(stream, size=batch_size):
            predictions = self.predict(docs)
-            if isinstance(predictions, tuple) and len(tuple) == 2:
-                scores, tensors = predictions
-                self.set_annotations(docs, scores, tensors=tensors)
-            else:
-                self.set_annotations(docs, predictions)
+            self.set_annotations(docs, predictions)
            yield from docs

    def predict(self, docs):
@@ -1435,7 +1397,7 @@ class Sentencizer(Pipe):
            guesses.append(doc_guesses)
        return guesses

-    def set_annotations(self, docs, batch_tag_ids, tensors=None):
+    def set_annotations(self, docs, batch_tag_ids):
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc

@@ -57,7 +57,7 @@ class SimpleNER(Pipe):
        scores = self.model.predict(docs)
        return scores

-    def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None):
+    def set_annotations(self, docs: List[Doc], scores: List[Floats2d]):
        """Set entities on a batch of documents from a batch of scores."""
        tag_names = self.get_tag_names()
        for i, doc in enumerate(docs):
@@ -67,9 +67,12 @@ class SimpleNER(Pipe):
            tags = iob_to_biluo(tags)
            doc.ents = spans_from_biluo_tags(doc, tags)

-    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
+    def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
+        if losses is None:
+            losses = {}
+        losses.setdefault("ner", 0.0)
        if not any(_has_ner(eg) for eg in examples):
-            return 0
+            return losses
        docs = [eg.predicted for eg in examples]
        set_dropout_rate(self.model, drop)
        scores, bp_scores = self.model.begin_update(docs)
@@ -79,10 +82,8 @@ class SimpleNER(Pipe):
            self.set_annotations(docs, scores)
        if sgd is not None:
            self.model.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault("ner", 0.0)
-            losses["ner"] += loss
-        return loss
+        losses["ner"] += loss
+        return losses

    def get_loss(self, examples, scores):
        loss = 0

@@ -83,12 +83,14 @@ class Tok2Vec(Pipe):
            assert tokvecs.shape[0] == len(doc)
            doc.tensor = tokvecs

-    def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
+    def update(self, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False):
        """Update the model.
-        examples (iterable): A batch of examples
+        examples (Iterable[Example]): A batch of examples
        drop (float): The droput rate.
-        sgd (callable): An optimizer.
-        RETURNS (dict): Results from the update.
+        sgd (Optimizer): An optimizer.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        set_annotations (bool): whether or not to update the examples with the predictions
+        RETURNS (Dict[str, float]): The updated losses dictionary
        """
        if losses is None:
            losses = {}
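The bare `*` added to these signatures makes every argument after `examples` keyword-only, so a stray positional value (say, a dropout rate landing where the removed `state` parameter used to sit) now fails loudly instead of being silently misassigned. A toy illustration of the mechanism, not the spaCy API itself:

# Toy illustration of the keyword-only signatures introduced above.
def update(examples, *, drop=0.0, sgd=None, losses=None):
    return drop

update(["eg"], drop=0.2)   # OK: explicit keyword
try:
    update(["eg"], 0.2)    # positional drop now raises TypeError
except TypeError as err:
    print("rejected:", err)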
@@ -124,6 +126,7 @@ class Tok2Vec(Pipe):
            self.listeners[-1].receive(batch_id, tokvecs, backprop)
        if set_annotations:
            self.set_annotations(docs, tokvecs)
+        return losses

    def get_loss(self, docs, golds, scores):
        pass

@@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
class ProjectConfigAsset(BaseModel):
    # fmt: off
    dest: StrictStr = Field(..., title="Destination of downloaded asset")
-    url: StrictStr = Field(..., title="URL of asset")
+    url: Optional[StrictStr] = Field(None, title="URL of asset")
    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
    # fmt: on

@@ -232,9 +232,10 @@ class ProjectConfigCommand(BaseModel):
    name: StrictStr = Field(..., title="Name of command")
    help: Optional[StrictStr] = Field(None, title="Command description")
    script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
-    deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
-    outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
-    outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
+    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
+    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
+    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
+    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
    # fmt: on

    class Config:
@@ -246,7 +247,7 @@ class ProjectConfigSchema(BaseModel):
    # fmt: off
    variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
    assets: List[ProjectConfigAsset] = Field([], title="Data assets")
-    run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
+    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
    # fmt: on

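Since these schemas are pydantic models, a project config is validated simply by instantiating them. A hedged sketch using a standalone copy of the command model above, so it runs without importing spaCy:

# Hedged sketch: validating a project command dict with a pydantic model
# shaped like ProjectConfigCommand above (a standalone copy, not spaCy's).
from typing import List, Optional
from pydantic import BaseModel, Field, StrictStr

class Command(BaseModel):
    name: StrictStr = Field(..., title="Name of command")
    help: Optional[StrictStr] = Field(None, title="Command description")
    script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")

cmd = Command(name="train", script=["python train.py"], deps=["corpus/train.json"])
assert cmd.no_skip is False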
@@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no


class ParserStepModel(Model):
-    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
+    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
+                 dropout=0.1):
        Model.__init__(self, name="parser_step_model", forward=step_forward)
        self.attrs["has_upper"] = has_upper
+        self.attrs["dropout_rate"] = dropout
        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
        if layers[1].get_dim("nP") >= 2:
            activation = "maxout"
@@ -243,6 +245,13 @@ class ParserStepModel(Model):
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.

+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
    @property
    def nO(self):
        if self.attrs["has_upper"]:
@@ -271,6 +280,19 @@ class ParserStepModel(Model):
            c_ids += ids.shape[1]
        return ids

+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
    def finish_steps(self, golds):
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
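The new `backprop_step` factors the queueing out of `step_forward`: each step's `(token_ids, d_vector, get_d_tokvecs)` triple is appended to `self.backprops` (moved to the GPU asynchronously under CuPy) and only replayed later by `finish_steps`. A pure-Python sketch of that accumulate-then-replay pattern, with toy callbacks and no CuPy or async copies:

# Pure-Python sketch of the accumulate-then-replay pattern used by
# backprop_step/finish_steps above (toy gradients, no GPU transfers).
backprops = []

def backprop_step(inputs, d_output, get_d_inputs):
    backprops.append((inputs, d_output, get_d_inputs))  # defer, don't compute yet

def finish_steps():
    total = 0.0
    for inputs, d_output, get_d_inputs in backprops:
        total += get_d_inputs(d_output)  # replay each step's backward pass
    return total

backprop_step([1, 2], 0.5, lambda d: d * 2)
backprop_step([3], 0.25, lambda d: d * 4)
assert finish_steps() == 2.0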
@@ -289,11 +311,17 @@ class ParserStepModel(Model):
        self.bp_tokvecs(d_tokvecs[:-1])
        return d_tokvecs

+NUMPY_OPS = NumpyOps()

def step_forward(model: ParserStepModel, states, is_train):
    token_ids = model.get_token_ids(states)
    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+    mask = None
    if model.attrs["has_upper"]:
+        dropout_rate = model.attrs["dropout_rate"]
+        if is_train and dropout_rate > 0:
+            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
+            vector *= mask
        scores, get_d_vector = model.vec2scores(vector, is_train)
    else:
        scores = NumpyOps().asarray(vector)
@@ -305,16 +333,9 @@ def step_forward(model: ParserStepModel, states, is_train):
        # Zero vectors for unseen classes
        d_scores *= model._class_mask
        d_vector = get_d_vector(d_scores)
-        if isinstance(model.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            model.backprops.append((
-                util.get_async(model.cuda_stream, token_ids),
-                util.get_async(model.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+        if mask is not None:
+            d_vector *= mask
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
        return None
    return scores, backprop_parser_step

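The forward pass now multiplies the state vector by a dropout mask and the backward pass reuses the same mask on `d_vector`, which is the defining requirement of dropout: the scaling applied forward must be mirrored backward. (Note that as committed, the mask is built with a hard-coded rate of 0.1 even though `dropout_rate` is read from the attrs.) A numpy sketch of inverted-dropout mask semantics, assumed to match thinc's `Ops.get_dropout_mask`:

import numpy

# Sketch of inverted-dropout mask semantics as used in step_forward above;
# assumed to match thinc's Ops.get_dropout_mask (kept units scaled by 1/(1-p)).
def get_dropout_mask(shape, rate, seed=0):
    rng = numpy.random.default_rng(seed)
    keep = (rng.random(shape) >= rate).astype("float32")
    return keep / (1.0 - rate)

vector = numpy.ones((2, 4), dtype="float32")
mask = get_dropout_mask(vector.shape, 0.1)
vector *= mask            # forward: drop and rescale
d_vector = numpy.ones_like(vector)
d_vector *= mask          # backward: same mask keeps gradients consistent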
@@ -437,7 +458,7 @@ cdef class precompute_hiddens:
        sum_state_features(<float*>state_vector.data,
            feat_weights, &ids[0,0],
            token_ids.shape[0], self.nF, self.nO*self.nP)
-        state_vector = state_vector + self.bias
+        state_vector += self.bias
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector_ids):

@@ -65,7 +65,6 @@ cdef class Parser:
        self.set_output(self.moves.n_moves)
        self.cfg = dict(cfg)
        self.cfg.setdefault("update_with_oracle_cut_size", 100)
-        self.cfg.setdefault("normalize_gradients_with_batch_size", True)
        self._multitasks = []
        for multitask in cfg.get("multitasks", []):
            self.add_multitask_objective(multitask)
@@ -154,7 +153,7 @@ cdef class Parser:
        doc (Doc): The document to be processed.
        """
        states = self.predict([doc])
-        self.set_annotations([doc], states, tensors=None)
+        self.set_annotations([doc], states)
        return doc

    def pipe(self, docs, int batch_size=256):
@@ -171,7 +170,7 @@ cdef class Parser:
            for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
                subbatch = list(subbatch)
                parse_states = self.predict(subbatch)
-                self.set_annotations(subbatch, parse_states, tensors=None)
+                self.set_annotations(subbatch, parse_states)
            yield from batch_in_order

    def predict(self, docs):
@@ -201,6 +200,8 @@ cdef class Parser:
        with nogil:
            self._parseC(&states[0],
                weights, sizes)
+        model.clear_memory()
+        del model
        return batch

    cdef void _parseC(self, StateC** states,
@@ -223,7 +224,7 @@ cdef class Parser:
        unfinished.clear()
        free_activations(&activations)

-    def set_annotations(self, docs, states, tensors=None):
+    def set_annotations(self, docs, states):
        cdef StateClass state
        cdef Doc doc
        for i, (state, doc) in enumerate(zip(states, docs)):
@@ -264,7 +265,7 @@ cdef class Parser:
            states[i].push_hist(guess)
        free(is_valid)

-    def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
+    def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
        cdef StateClass state
        if losses is None:
            losses = {}
@@ -280,11 +281,12 @@ cdef class Parser:
            [eg.predicted for eg in examples])
        if self.cfg["update_with_oracle_cut_size"] >= 1:
            # Chop sequences into lengths of this many transitions, to make the
-            # batch uniform length. We randomize this to overfit less.
+            # batch uniform length.
+            # We used to randomize this, but it's not clear that actually helps?
            cut_size = self.cfg["update_with_oracle_cut_size"]
            states, golds, max_steps = self._init_gold_batch(
                examples,
-                max_length=numpy.random.choice(range(5, cut_size))
+                max_length=cut_size
            )
        else:
            states, golds, _ = self.moves.init_gold_batch(examples)
@@ -292,24 +294,15 @@ cdef class Parser:
        if not states:
            return losses
        all_states = list(states)
-        states_golds = zip(states, golds)
-        for _ in range(max_steps):
-            if not states_golds:
-                break
+        states_golds = list(zip(states, golds))
+        while states_golds:
            states, golds = zip(*states_golds)
            scores, backprop = model.begin_update(states)
            d_scores = self.get_batch_loss(states, golds, scores, losses)
-            if self.cfg["normalize_gradients_with_batch_size"]:
-                # We have to be very careful how we do this, because of the way we
-                # cut up the batch. We subdivide long sequences. If we normalize
-                # naively, we end up normalizing by sequence length, which
-                # is bad: that would mean that states in long sequences
-                # consistently get smaller gradients. Imagine if we have two
-                # sequences, one length 1000, one length 20. If we cut up
-                # the 1k sequence so that we have a "batch" of 50 subsequences,
-                # we don't want the gradients to get 50 times smaller!
-                d_scores /= n_examples
-
+            # Note that the gradient isn't normalized by the batch size
+            # here, because our "samples" are really the states...But we
+            # can't normalize by the number of states either, as then we'd
+            # be getting smaller gradients for states in long sequences.
            backprop(d_scores)
            # Follow the predicted action
            self.transition_states(states, scores)
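The replacement comment pins down the normalization dilemma: after chopping, the apparent batch is subsequences rather than documents, so dividing gradients by it would shrink them merely because a long document was cut into many pieces. A toy numeric illustration of that pitfall:

# Toy numbers for the concern described in the comments above: one
# 1000-transition doc chopped into 50 subsequences, plus one short doc,
# makes the "batch" look like 51 items though there are only 2 examples.
n_subsequences = 50 + 1   # apparent batch size after chopping
n_examples = 2            # real number of documents

g = 1.0  # stand-in gradient for one state
naive = g / n_subsequences    # shrinks ~25x just because we chopped
by_examples = g / n_examples  # invariant to how the batch was chopped
assert abs(by_examples / naive - n_subsequences / n_examples) < 1e-9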
@@ -321,6 +314,13 @@ cdef class Parser:
        if set_annotations:
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, all_states)
+        # Ugh, this is annoying. If we're working on GPU, we want to free the
+        # memory ASAP. It seems that Python doesn't necessarily get around to
+        # removing these in time if we don't explicitly delete? It's confusing.
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
        return losses

    def rehearse(self, examples, sgd=None, losses=None, **cfg):
@@ -344,7 +344,7 @@ cdef class Parser:
        set_dropout_rate(self._rehearsal_model, 0.0)
        set_dropout_rate(self.model, 0.0)
        tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, finish_update = self.model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
        n_scores = 0.
        loss = 0.
        while states:
@@ -360,10 +360,16 @@ cdef class Parser:
            states = [state for state in states if not state.is_final()]
            n_scores += d_scores.size
        # Do the backprop
-        finish_update(docs)
+        backprop_tok2vec(docs)
        if sgd is not None:
            self.model.finish_update(sgd)
        losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
        return losses

    def get_gradients(self):
@@ -407,6 +413,7 @@ cdef class Parser:
            cpu_log_loss(c_d_scores,
                costs, is_valid, &scores[i, 0], d_scores.shape[1])
            c_d_scores += d_scores.shape[1]
+        # Note that we don't normalize this. See comment in update() for why.
        if losses is not None:
            losses.setdefault(self.name, 0.)
            losses[self.name] += (d_scores**2).sum()
@@ -525,21 +532,25 @@ cdef class Parser:
            StateClass state
            Transition action
        all_states = self.moves.init_batch([eg.predicted for eg in examples])
+        states = []
+        golds = []
        kept = []
+        max_length_seen = 0
        for state, eg in zip(all_states, examples):
            if self.moves.has_gold(eg) and not state.is_final():
                gold = self.moves.init_gold(state, eg)
-                oracle_actions = self.moves.get_oracle_sequence_from_state(
-                    state.copy(), gold)
-                kept.append((eg, state, gold, oracle_actions))
-                min_length = min(min_length, len(oracle_actions))
-                max_length_seen = max(max_length, len(oracle_actions))
+                if len(eg.x) < max_length:
+                    states.append(state)
+                    golds.append(gold)
+                else:
+                    oracle_actions = self.moves.get_oracle_sequence_from_state(
+                        state.copy(), gold)
+                    kept.append((eg, state, gold, oracle_actions))
+                    min_length = min(min_length, len(oracle_actions))
+                    max_length_seen = max(max_length, len(oracle_actions))
        if not kept:
-            return [], [], 0
+            return states, golds, 0
        max_length = max(min_length, min(max_length, max_length_seen))
-        states = []
-        golds = []
        cdef int clas
        max_moves = 0
        for eg, state, gold, oracle_actions in kept:
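`_init_gold_batch` now keeps short examples whole and chops only those at or above `max_length`, with the cut length fixed rather than randomized. A toy sketch of the chopping idea on plain lists (the real method tracks parser states, golds and oracle actions):

# Toy sketch of the chopping strategy in _init_gold_batch above: sequences
# shorter than max_length are kept whole; longer ones are split into
# fixed-size pieces so the batch has a uniform, bounded length.
def chop(sequences, max_length):
    pieces = []
    for seq in sequences:
        if len(seq) < max_length:
            pieces.append(seq)
        else:
            for i in range(0, len(seq), max_length):
                pieces.append(seq[i:i + max_length])
    return pieces

batch = chop([list(range(5)), list(range(23))], max_length=10)
assert [len(p) for p in batch] == [5, 10, 10, 3]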
@@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):

def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
    assert contains_cycle(tree) is None
-    assert contains_cycle(cyclic_tree) == set([3, 4, 5])
+    assert contains_cycle(cyclic_tree) == {3, 4, 5}
    assert contains_cycle(partial_tree) is None
    assert contains_cycle(multirooted_tree) is None

@@ -198,10 +198,10 @@ def test_overfitting_IO():
    nlp.add_pipe(parser)
    optimizer = nlp.begin_training()

-    for i in range(50):
+    for i in range(100):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses["parser"] < 0.00001
+    assert losses["parser"] < 0.0001

    # test the trained model
    test_text = "I like securities."

@@ -38,6 +38,11 @@ def test_overfitting_IO():
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    # add some cases where SENT_START == -1
+    train_examples[0].reference[10].is_sent_start = False
+    train_examples[1].reference[1].is_sent_start = False
+    train_examples[1].reference[11].is_sent_start = False

    nlp.add_pipe(senter)
    optimizer = nlp.begin_training()

@@ -84,7 +84,7 @@ def test_overfitting_IO():
    # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
-    textcat = nlp.create_pipe("textcat")
+    textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True})
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))

@@ -23,6 +23,7 @@ def test_issue2070():
    assert len(doc) == 11


+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2179():
    """Test that spurious 'extra_labels' aren't created when initializing NER."""
    nlp = Italian()
@@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
    assert len(matches) == 3


+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2482():
    """Test we can serialize and deserialize a blank NER or parser model."""
    nlp = Italian()

@@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
    assert doc[0].like_num


+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2800():
    """Test issue that arises when too many labels are added to NER model.
    Used to cause segfault.
    """
    nlp = English()
    train_data = []
-    train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
+    train_data.extend(
+        [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
+    )
    entity_types = [str(i) for i in range(1000)]
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)

@@ -88,6 +88,7 @@ def test_issue3199():
    assert list(doc[0:3].noun_chunks) == []


+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels

spacy/tests/regression/test_issue3501-4000.py (new file, 472 lines)
@@ -0,0 +1,472 @@
import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.defaults import default_parser
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from thinc.api import compounding
import spacy
import srsly
import numpy

from ..util import make_tempdir, get_doc


@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop


def test_issue_3526_1(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_issue_3526_2(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_issue_3526_3(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True


def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html


def test_issue3540(en_vocab):
    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = numpy.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor
    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_2 = [token.vector for token in doc]
    assert len(vectors_2) == len(doc)
    assert vectors_1[0].tolist() == vectors_2[0].tolist()
    assert vectors_1[1].tolist() == vectors_2[1].tolist()
    assert vectors_1[2].tolist() == vectors_2[2].tolist()
    assert vectors_1[4].tolist() == vectors_2[5].tolist()
    assert vectors_1[5].tolist() == vectors_2[6].tolist()


def test_issue3549(en_vocab):
    """Test that match pattern validation doesn't raise on empty errors."""
    matcher = Matcher(en_vocab, validate=True)
    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
    matcher.add("GOOD", [pattern])
    with pytest.raises(MatchPatternError):
        matcher.add("BAD", [[{"X": "Y"}]])


@pytest.mark.xfail
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)


def test_issue3611():
    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training(X=x_train, Y=y_train)
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )


def test_issue3625():
    """Test that default punctuation rules applies to hindi unicode characters"""
    nlp = Hindi()
    doc = nlp("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    assert [token.text for token in doc] == expected


def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens"""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" not in parser.labels


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    config = {
        "learn_tokens": True,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" in parser.labels


def test_issue3839(en_vocab):
    """Test that match IDs returned by the matcher are correct, are in the string """
    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
    matcher = Matcher(en_vocab)
    match_id = "PATTERN"
    pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
    pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
    matcher.add(match_id, [pattern1])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]
    matcher = Matcher(en_vocab)
    matcher.add(match_id, [pattern2])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]


@pytest.mark.parametrize(
    "sentence",
    [
        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
    ],
)
def test_issue3869(sentence):
    """Test that the Doc's count_by function works consistently"""
    nlp = English()
    doc = nlp(sentence)
    count = 0
    for token in doc:
        count += token.is_alpha
    assert count == doc.count_by(IS_ALPHA).get(1, 0)


def test_issue3879(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    assert len(doc) == 5
    pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [pattern])
    assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("parser"))
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.get_pipe("parser").add_label("dep")
    nlp.get_pipe("ner").add_label("PERSON")
    nlp.get_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass


def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)


def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly."""
    matcher = Matcher(en_vocab)
    pattern = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
    matches = matcher(doc)
    assert len(matches) == 0


def test_issue3959():
    """ Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""
    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)
        doc2 = nlp("")
        doc2.from_disk(file_path)
        assert doc2[0].pos_ == "NOUN"


def test_issue3962(en_vocab):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    # fmt: off
    words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"
    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1
    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json
    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"
    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1


def test_issue3962_long(en_vocab):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    # fmt: off
    words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # head set to the new artificial head (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"


def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids
@@ -1,8 +0,0 @@
(Deleted file: an eight-line module containing test_issue3521, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,85 +0,0 @@
import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy import load
import srsly

from ..util import make_tempdir


@pytest.fixture
def patterns():
    return [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]


@pytest.fixture
def add_ent():
    def add_ent_component(doc):
        doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
        return doc

    return add_ent_component


def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)

    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
@@ -1,30 +0,0 @@
(Deleted file: test_issue3531 and its displacy import, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,44 +0,0 @@
(Deleted file: test_issue3540, moved into test_issue3501-4000.py above; the only difference is that this module imported numpy as np.)

@@ -1,12 +0,0 @@
(Deleted file: test_issue3549, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,14 +0,0 @@
(Deleted file: test_issue3555, including its xfail marker, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,45 +0,0 @@
(Deleted file: test_issue3611, moved into test_issue3501-4000.py above; identical up to blank-line formatting.)

@@ -1,9 +0,0 @@
(Deleted file: test_issue3625, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,10 +0,0 @@
(Deleted file: test_issue3803, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,34 +0,0 @@
(Deleted file: test_issue3830_no_subtok and test_issue3830_with_subtok, moved into test_issue3501-4000.py above; the only difference is that this module imported DependencyParser from spacy.pipeline.pipes.)

@@ -1,18 +0,0 @@
(Deleted file: test_issue3839, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,25 +0,0 @@
(Deleted file: test_issue3869, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,11 +0,0 @@
(Deleted file: test_issue3879, moved essentially verbatim into test_issue3501-4000.py above.)

@@ -1,21 +0,0 @@
(Deleted file: test_issue3880, moved essentially verbatim into test_issue3501-4000.py above.)

@ -1,12 +0,0 @@
|
|||
from spacy.displacy import parse_deps
from spacy.tokens import Doc


def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)

@@ -1,17 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly."""
    matcher = Matcher(en_vocab)
    pattern = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
    matches = matcher(doc)
    assert len(matches) == 0

@@ -1,26 +0,0 @@
from spacy.lang.en import English
from ..util import make_tempdir


def test_issue3959():
    """Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""

    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)

        doc2 = nlp("")
        doc2.from_disk(file_path)

        assert doc2[0].pos_ == "NOUN"

@@ -1,117 +0,0 @@
import pytest

from ..util import get_doc


@pytest.fixture
def doc(en_tokenizer):
    text = "He jests at scars, that never felt a wound."
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ccomp",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_issue3962(doc):
    """Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"

    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1

    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json

    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"

    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1


@pytest.fixture
def two_sent_doc(en_tokenizer):
    text = "He jests at scars. They never felt a wound."
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ROOT",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_issue3962_long(two_sent_doc):
    """Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    assert (
        doc2[4].head.text == "They"
    )  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    assert (
        doc2[5].head.text == "They"
    )  # head set to the new artificial head (in sentence 2)
    assert doc2[5].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"

@@ -1,19 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids

spacy/tests/regression/test_issue4001-4500.py (new file, 469 lines)
@@ -0,0 +1,469 @@
import pytest
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
from spacy.pipeline.defaults import default_ner
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus
from spacy.gold.converters import json2docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict

from ..util import make_tempdir


def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes."""
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1


def test_issue4030():
    """Test whether textcat works fine with an empty doc."""
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )
    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = load_model(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = load_model(nlp_dir)
        assert nlp3.lang == "en"


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works
    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed


def test_issue4133(en_vocab):
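    """Test that token.pos_ annotations survive a Vocab/Doc bytes round-trip."""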
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)
    actual = []
    for token in doc:
        actual.append(token.pos_)
    assert actual == pos


def test_issue4190():
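    """Test that a customized tokenizer survives serialization to disk and reloading."""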
    def customize_tokenizer(nlp):
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
        infix_re = compile_infix_regex(nlp.Defaults.infixes)
        # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
        exceptions = {
            k: v
            for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
            if not (len(k) == 2 and k[1] == ".")
        }
        new_tokenizer = Tokenizer(
            nlp.vocab,
            exceptions,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=nlp.tokenizer.token_match,
        )
        nlp.tokenizer = new_tokenizer

    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = load_model(model_dir)
        # This should be the modified tokenizer
        doc_2 = nlp_2(test_string)
        result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
    """Test that running an entity_ruler after ner gives consistent results."""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()
    assert "ner" in nlp.pipe_names
    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2
    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names
    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_


def test_multiple_predictions():
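    """Test that a pipe whose predict() returns multiple outputs can be applied to a doc."""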
    class DummyPipe(Pipe):
        def __init__(self):
            self.model = "dummy_model"

        def predict(self, docs):
            return ([1, 2, 3], [4, 5, 6])

        def set_annotations(self, docs, scores):
            return docs

    nlp = Language()
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)


@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
    """This should not crash or exit with some strange error code."""
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(
        docs, beam_width=beam_width, beam_density=beam_density
    )

    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data doesn't throw errors."""
    nlp = English()
    example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    TRAIN_DATA = [example, example]
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)


def test_issue4367():
    """Test that DocBin initialization works with various attrs."""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])


def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)


def test_issue4402():
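    """Test that JSON training data round-trips through json2docs, DocBin and Corpus."""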
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json2docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4

@@ -1,23 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes."""
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1

@@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example


def test_issue4030():
    """Test whether textcat works fine with an empty doc."""
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )

    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )

    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0

@@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner

from ..util import make_tempdir


def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = spacy.load(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2

@@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path

from ..util import make_tempdir


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab

    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)

        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)

        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"

@@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works

    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed

@@ -1,28 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    doc_bytes = doc.to_bytes()

    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)

    actual = []
    for token in doc:
        actual.append(token.pos_)

    assert actual == pos

@@ -1,46 +0,0 @@
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util

from ..util import make_tempdir


def test_issue4190():
    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = util.load_model(model_dir)
        # This should be the modified tokenizer
        doc_2 = nlp_2(test_string)
        result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


def customize_tokenizer(nlp):
    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer

@@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


def test_issue4267():
    """Test that running an entity_ruler after ner gives consistent results."""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()

    assert "ner" in nlp.pipe_names

    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2

    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]

    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names

    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2

@@ -1,9 +0,0 @@
from spacy.lang.el import Greek


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_

@@ -1,25 +0,0 @@
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe


class DummyPipe(Pipe):
    def __init__(self):
        self.model = "dummy_model"

    def predict(self, docs):
        return ([1, 2, 3], [4, 5, 6])

    def set_annotations(self, docs, scores, tensors=None):
        return docs


@pytest.fixture
def nlp():
    return Language()


def test_multiple_predictions(nlp):
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)

@@ -1,47 +0,0 @@
from collections import defaultdict

import pytest

from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer

from spacy.lang.en import English
from spacy.tokens import Span


# skipped after removing Beam stuff during the Example/GoldParse refactor
@pytest.mark.skip
def test_issue4313():
    """This should not crash or exit with some strange error code."""
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(
        docs, beam_width=beam_width, beam_density=beam_density
    )

    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score

@@ -1,24 +0,0 @@
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch
from thinc.api import compounding
import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data doesn't throw errors."""

    nlp = English()
    example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    TRAIN_DATA = [example, example]

    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)

@@ -1,8 +0,0 @@
from spacy.tokens import DocBin


def test_issue4367():
    """Test that DocBin initialization works with various attrs."""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])

@@ -1,10 +0,0 @@
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab


def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)

@@ -1,98 +0,0 @@
from spacy.gold import Corpus
from spacy.lang.en import English

from ..util import make_tempdir
from ...gold.converters import json2docs
from ...tokens import DocBin


def test_issue4402():
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json2docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4


json_data = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "How", "ner": "O"},
                        {"id": 1, "orth": "should", "ner": "O"},
                        {"id": 2, "orth": "I", "ner": "O"},
                        {"id": 3, "orth": "cook", "ner": "O"},
                        {"id": 4, "orth": "bacon", "ner": "O"},
                        {"id": 5, "orth": "in", "ner": "O"},
                        {"id": 6, "orth": "an", "ner": "O"},
                        {"id": 7, "orth": "oven", "ner": "O"},
                        {"id": 8, "orth": "?", "ner": "O"},
                    ],
                    "brackets": [],
                },
                {
                    "tokens": [
                        {"id": 9, "orth": "\n", "ner": "O"},
                        {"id": 10, "orth": "I", "ner": "O"},
                        {"id": 11, "orth": "'ve", "ner": "O"},
                        {"id": 12, "orth": "heard", "ner": "O"},
                        {"id": 13, "orth": "of", "ner": "O"},
                        {"id": 14, "orth": "people", "ner": "O"},
                        {"id": 15, "orth": "cooking", "ner": "O"},
                        {"id": 16, "orth": "bacon", "ner": "O"},
                        {"id": 17, "orth": "in", "ner": "O"},
                        {"id": 18, "orth": "an", "ner": "O"},
                        {"id": 19, "orth": "oven", "ner": "O"},
                        {"id": 20, "orth": ".", "ner": "O"},
                    ],
                    "brackets": [],
                },
            ],
            "cats": [
                {"label": "baking", "value": 1.0},
                {"label": "not_baking", "value": 0.0},
            ],
        },
        {
            "raw": "What is the difference between white and brown eggs?\n",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "What", "ner": "O"},
                        {"id": 1, "orth": "is", "ner": "O"},
                        {"id": 2, "orth": "the", "ner": "O"},
                        {"id": 3, "orth": "difference", "ner": "O"},
                        {"id": 4, "orth": "between", "ner": "O"},
                        {"id": 5, "orth": "white", "ner": "O"},
                        {"id": 6, "orth": "and", "ner": "O"},
                        {"id": 7, "orth": "brown", "ner": "O"},
                        {"id": 8, "orth": "eggs", "ner": "O"},
                        {"id": 9, "orth": "?", "ner": "O"},
                    ],
                    "brackets": [],
                },
                {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
            ],
            "cats": [
                {"label": "baking", "value": 0.0},
                {"label": "not_baking", "value": 1.0},
            ],
        },
    ],
}

spacy/tests/regression/test_issue4501-5000.py (new file, 288 lines)
@@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle

from ..util import get_doc, make_tempdir


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
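    """Test that Example.from_dict works with tokenization that is misaligned with the doc."""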
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})


def test_issue4590(en_vocab):
    """Test that the matches passed to an on_match callback are the same as the
    matches returned when no on_match callback is set."""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()
    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)
    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    matches = matcher(doc)
    on_match_args = on_match.call_args
    assert on_match_args[0][3] == matches


def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded


def test_issue4651_without_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    not specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded


def test_issue4665():
    """
    conllu2docs should not raise an exception if the HEAD column contains an
    underscore
    """
    input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
    conllu2docs(input_data)


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
        assert kb2.get_size_entities() == 1


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
        assert "sentencizer" not in new_nlp.pipe_names
        assert "entity_ruler" in new_nlp.pipe_names


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
    """Ensure the pickling of the NER goes well."""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass


def test_issue4849():
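    """Test that EntityRuler entity IDs are preserved in nlp.pipe with one and two processes."""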
    nlp = English()
    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )
    nlp.add_pipe(ruler)
    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """
    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    """Ensure that this runs correctly and doesn't hang or crash on Windows /
    macOS."""
    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."


def test_issue4924():
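    """Test that nlp.evaluate works on an example with no gold annotations."""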
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])

@@ -1,16 +0,0 @@
from spacy.tokens import Doc, DocBin


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"

@@ -1,11 +0,0 @@
import pytest

from spacy.gold import Example


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})

@@ -1,35 +0,0 @@
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc


def test_issue4590(en_vocab):
    """Test that the matches passed to an on_match callback are the same as the
    matches returned when no on_match callback is set."""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)

    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]

    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)

    matches = matcher(doc)

    on_match_args = on_match.call_args

    assert on_match_args[0][3] == matches

@@ -1,62 +0,0 @@
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4651_with_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
specified.
|
||||
"""
|
||||
text = "Spacy is a python library for nlp"
|
||||
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
|
||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
doc = nlp(text)
|
||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||
|
||||
nlp_reloaded = English()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "entityruler"
|
||||
ruler.to_disk(file_path)
|
||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||
|
||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||
doc_reloaded = nlp_reloaded(text)
|
||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||
|
||||
assert res == res_reloaded
|
||||
|
||||
|
||||
def test_issue4651_without_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
not specified.
|
||||
"""
|
||||
text = "Spacy is a python library for nlp"
|
||||
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
doc = nlp(text)
|
||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||
|
||||
nlp_reloaded = English()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "entityruler"
|
||||
ruler.to_disk(file_path)
|
||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||
|
||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||
doc_reloaded = nlp_reloaded(text)
|
||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||
|
||||
assert res == res_reloaded
|
|
@ -1,35 +0,0 @@
import pytest

# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs

input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""


@pytest.mark.xfail
def test_issue4665():
    """
    conllu2json should not raise an exception if the HEAD column contains an
    underscore
    """
    pass
    # conllu2json(input_data)
@ -1,36 +0,0 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English

from ..util import make_tempdir


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )

    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    assert kb2.get_size_entities() == 1
@ -1,20 +0,0 @@
from spacy.util import load_model_from_path
from spacy.lang.en import English

from ..util import make_tempdir


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names
@ -1,41 +0,0 @@
import pickle
import numpy

from spacy.lang.en import English
from spacy.vocab import Vocab

from spacy.tests.util import make_tempdir


def test_pickle_ner():
    """Ensure that pickling the NER component works correctly."""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


def test_issue4725():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])

    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass
@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


def test_issue4849():
    nlp = English()

    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )

    nlp.add_pipe(ruler)

    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """

    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2

    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
@ -1,40 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Span, Doc


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."
@ -1,8 +0,0 @@
from spacy.gold import Example
from spacy.language import Language


def test_issue4924():
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])
@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
    # Test that the comparison between a Span and a Token works correctly
    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@ -8,7 +10,6 @@ def test_issue5152():
    text = nlp("Talk about being boring!")
    text_var = nlp("Talk of being boring!")
    y = nlp("Let")

    span = text[0:3]  # Talk about being
    span_2 = text[0:3]  # Talk about being
    span_3 = text_var[0:3]  # Talk of being
@ -63,7 +63,8 @@ def tagger():
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    tagger.begin_training(pipeline=nlp.pipeline)
    with pytest.warns(UserWarning):
        tagger.begin_training(pipeline=nlp.pipeline)
    return tagger
31 spacy/tests/regression/test_issue5551.py Normal file
@ -0,0 +1,31 @@
from spacy.lang.en import English
from spacy.util import fix_random_seed


def test_issue5551():
    """Test that after fixing the random seed, the results of the pipeline are truly identical"""
    component = "textcat"
    pipe_cfg = {"exclusive_classes": False}

    results = []
    for i in range(3):
        fix_random_seed(0)
        nlp = English()
        example = (
            "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
            {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
        )
        nlp.add_pipe(nlp.create_pipe(component, config=pipe_cfg), last=True)
        pipe = nlp.get_pipe(component)
        for label in set(example[1]["cats"]):
            pipe.add_label(label)
        nlp.begin_training(component_cfg={component: pipe_cfg})

        # Store the result of each iteration
        result = pipe.model.predict([nlp.make_doc(example[0])])
        results.append(list(result[0]))

    # All results should be the same because of the fixed seed
    assert len(results) == 3
    assert results[0] == results[1]
    assert results[0] == results[2]
@ -1,3 +1,4 @@
import numpy
from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
@ -5,6 +6,7 @@ from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example
from spacy.gold.converters import json2docs
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding
@ -153,6 +155,27 @@ def test_gold_biluo_misalign(en_vocab):
    assert tags == ["O", "O", "O", "-", "-", "-"]


def test_example_constructor(en_vocab):
    words = ["I", "like", "stuff"]
    tags = ["NOUN", "VERB", "NOUN"]
    tag_ids = [en_vocab.strings.add(tag) for tag in tags]
    predicted = Doc(en_vocab, words=words)
    reference = Doc(en_vocab, words=words)
    reference = reference.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
    example = Example(predicted, reference)
    tags = example.get_aligned("TAG", as_string=True)
    assert tags == ["NOUN", "VERB", "NOUN"]


def test_example_from_dict_tags(en_vocab):
    words = ["I", "like", "stuff"]
    tags = ["NOUN", "VERB", "NOUN"]
    predicted = Doc(en_vocab, words=words)
    example = Example.from_dict(predicted, {"TAGS": tags})
    tags = example.get_aligned("TAG", as_string=True)
    assert tags == ["NOUN", "VERB", "NOUN"]


def test_example_from_dict_no_ner(en_vocab):
    words = ["a", "b", "c", "d"]
    spaces = [True, True, False, True]
@ -272,72 +295,72 @@ def test_split_sentences(en_vocab):


def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
    words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
    words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "Mr. and Mrs. Smith flew to "
    prefix = "Mr and Mrs Smith flew to "
    entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
    gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "O", "O", "U-LOC", "O"]

    entities = [
        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]

    entities = [
        (len("Mr. and "), len("Mr. and Mrs."), "PERSON"),  # "Mrs." is a Person
        (len("Mr and "), len("Mr and Mrs"), "PERSON"),  # "Mrs" is a Person
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", None, "O", "U-LOC", "O"]


def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
    words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "Mr. and Mrs. Smith flew to "
    prefix = "Mr and Mrs Smith flew to "
    entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
    gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."]
    gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]

    entities = [
        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
    gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]


def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
    words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."]
    words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "Mr. and Mrs. Smith flew to "
    prefix = "Mr and Mrs Smith flew to "
    entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
    gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
    gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]

    entities = [
        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
    gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
    ner_tags = example.get_aligned_ner()
    assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
@ -407,6 +430,49 @@ def test_biluo_spans(en_tokenizer):
    assert spans[1].label_ == "GPE"


def test_aligned_spans_y2x(en_vocab, en_tokenizer):
    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len("Mr and Mrs Smith"), "PERSON"),
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    ents_ref = example.reference.ents
    assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
    assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]


def test_aligned_spans_x2y(en_vocab, en_tokenizer):
    text = "Mr and Mrs Smith flew to San Francisco Valley"
    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "PERSON", "pattern": "Mr and Mrs Smith"},
        {"label": "LOC", "pattern": "San Francisco Valley"},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len("Mr and Mrs Smith"), "PERSON"),
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
    assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]

    # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
    ents_pred = example.predicted.ents
    assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
    assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]


def test_gold_ner_missing_tags(en_tokenizer):
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
@ -414,6 +480,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
    assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]


def test_projectivize(en_tokenizer):
    doc = en_tokenizer("He pretty quickly walks away")
    heads = [3, 2, 3, 0, 2]
    example = Example.from_dict(doc, {"heads": heads})
    proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
    nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
    assert proj_heads == [3, 2, 3, 0, 3]
    assert nonproj_heads == [3, 2, 3, 0, 2]


def test_iob_to_biluo():
    good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
    good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
156 spacy/tests/test_models.py Normal file
@ -0,0 +1,156 @@
from typing import List

import pytest
from thinc.api import fix_random_seed, Adam, set_dropout_rate
from numpy.testing import assert_array_equal
import numpy

from spacy.ml.models import build_Tok2Vec_model
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
from spacy.lang.en import English
from spacy.lang.en.examples import sentences as EN_SENTENCES


def get_all_params(model):
    params = []
    for node in model.walk():
        for name in node.param_names:
            params.append(node.get_param(name).ravel())
    return node.ops.xp.concatenate(params)


def get_docs():
    nlp = English()
    return list(nlp.pipe(EN_SENTENCES + [" ".join(EN_SENTENCES)]))


def get_gradient(model, Y):
    if isinstance(Y, model.ops.xp.ndarray):
        dY = model.ops.alloc(Y.shape, dtype=Y.dtype)
        dY += model.ops.xp.random.uniform(-1.0, 1.0, Y.shape)
        return dY
    elif isinstance(Y, List):
        return [get_gradient(model, y) for y in Y]
    else:
        raise ValueError(f"Could not get gradient for type {type(Y)}")


def default_tok2vec():
    return build_Tok2Vec_model(**TOK2VEC_KWARGS)


TOK2VEC_KWARGS = {
    "width": 96,
    "embed_size": 2000,
    "subword_features": True,
    "char_embed": False,
    "conv_depth": 4,
    "bilstm_depth": 0,
    "maxout_pieces": 4,
    "window_size": 1,
    "dropout": 0.1,
    "nM": 0,
    "nC": 0,
    "pretrained_vectors": None,
}

TEXTCAT_KWARGS = {
    "width": 64,
    "embed_size": 2000,
    "pretrained_vectors": None,
    "exclusive_classes": False,
    "ngram_size": 1,
    "window_size": 1,
    "conv_depth": 2,
    "dropout": None,
    "nO": 7,
}

TEXTCAT_CNN_KWARGS = {
    "tok2vec": default_tok2vec(),
    "exclusive_classes": False,
    "nO": 13,
}


@pytest.mark.parametrize(
    "seed,model_func,kwargs",
    [
        (0, build_Tok2Vec_model, TOK2VEC_KWARGS),
        (0, build_text_classifier, TEXTCAT_KWARGS),
        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS),
    ],
)
def test_models_initialize_consistently(seed, model_func, kwargs):
    fix_random_seed(seed)
    model1 = model_func(**kwargs)
    model1.initialize()
    fix_random_seed(seed)
    model2 = model_func(**kwargs)
    model2.initialize()
    params1 = get_all_params(model1)
    params2 = get_all_params(model2)
    assert_array_equal(params1, params2)


@pytest.mark.parametrize(
    "seed,model_func,kwargs,get_X",
    [
        (0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
        (0, build_text_classifier, TEXTCAT_KWARGS, get_docs),
        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
    ],
)
def test_models_predict_consistently(seed, model_func, kwargs, get_X):
    fix_random_seed(seed)
    model1 = model_func(**kwargs).initialize()
    Y1 = model1.predict(get_X())
    fix_random_seed(seed)
    model2 = model_func(**kwargs).initialize()
    Y2 = model2.predict(get_X())

    if model1.has_ref("tok2vec"):
        tok2vec1 = model1.get_ref("tok2vec").predict(get_X())
        tok2vec2 = model2.get_ref("tok2vec").predict(get_X())
        for i in range(len(tok2vec1)):
            for j in range(len(tok2vec1[i])):
                assert_array_equal(numpy.asarray(tok2vec1[i][j]), numpy.asarray(tok2vec2[i][j]))

    if isinstance(Y1, numpy.ndarray):
        assert_array_equal(Y1, Y2)
    elif isinstance(Y1, List):
        assert len(Y1) == len(Y2)
        for y1, y2 in zip(Y1, Y2):
            assert_array_equal(y1, y2)
    else:
        raise ValueError(f"Could not compare type {type(Y1)}")


@pytest.mark.parametrize(
    "seed,dropout,model_func,kwargs,get_X",
    [
        (0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
        (0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs),
        (0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
    ],
)
def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
    def get_updated_model():
        fix_random_seed(seed)
        optimizer = Adam(0.001)
        model = model_func(**kwargs).initialize()
        initial_params = get_all_params(model)
        set_dropout_rate(model, dropout)
        for _ in range(5):
            Y, get_dX = model.begin_update(get_X())
            dY = get_gradient(model, Y)
            _ = get_dX(dY)
            model.finish_update(optimizer)
        updated_params = get_all_params(model)
        with pytest.raises(AssertionError):
            assert_array_equal(initial_params, updated_params)
        return model

    model1 = get_updated_model()
    model2 = get_updated_model()
    assert_array_equal(get_all_params(model1), get_all_params(model2))
31 spacy/tests/test_projects.py Normal file
@ -0,0 +1,31 @@
import pytest
from spacy.cli.project.util import validate_project_commands
from spacy.schemas import ProjectConfigSchema, validate


@pytest.mark.parametrize(
    "config",
    [
        {"commands": [{"name": "a"}, {"name": "a"}]},
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    with pytest.raises(SystemExit):
        validate_project_commands(config)


@pytest.mark.parametrize(
    "config,n_errors",
    [
        ({"commands": {"a": []}}, 1),
        ({"commands": [{"help": "..."}]}, 1),
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        ({"commands": [{"extra": "b"}]}, 2),
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    errors = validate(ProjectConfigSchema, config)
    assert len(errors) == n_errors
@ -803,7 +803,7 @@ cdef class Doc:
        attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                 for id_ in attrs]
        if array.dtype != numpy.uint64:
            warnings.warn(Warnings.W101.format(type=array.dtype))
            warnings.warn(Warnings.W028.format(type=array.dtype))

        if SENT_START in attrs and HEAD in attrs:
            raise ValueError(Errors.E032)
@ -20,7 +20,6 @@ import subprocess
from contextlib import contextmanager
import tempfile
import shutil
import hashlib
import shlex

try:
@ -449,6 +448,16 @@ def split_command(command: str) -> List[str]:
    return shlex.split(command, posix=not is_windows)


def join_command(command: List[str]) -> str:
    """Join a command using shlex. shlex.join is only available for Python 3.8+,
    so we're using a workaround here.

    command (List[str]): The command to join.
    RETURNS (str): The joined command
    """
    return " ".join(shlex.quote(cmd) for cmd in command)

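# Illustrative check (a sketch, not part of the diff): shlex.quote wraps
# arguments containing spaces in single quotes on POSIX, so one would expect
#   join_command(["python", "-m", "spacy", "run all"]) == "python -m spacy 'run all'"
# while arguments without special characters are left unquoted.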
def run_command(command: Union[str, List[str]]) -> None:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.
@ -501,23 +510,13 @@ def make_tempdir():
        warnings.warn(Warnings.W091.format(dir=d, msg=e))


def get_hash(data) -> str:
    """Get the hash for a JSON-serializable object.
def is_cwd(path: Union[Path, str]) -> bool:
    """Check whether a path is the current working directory.

    data: The data to hash.
    RETURNS (str): The hash.
    path (Union[Path, str]): The directory path.
    RETURNS (bool): Whether the path is the current working directory.
    """
    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
    return hashlib.md5(data_str).hexdigest()


def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file given its file path.

    path (Union[Path, str]): The file path.
    RETURNS (str): The checksum.
    """
    return hashlib.md5(Path(path).read_bytes()).hexdigest()
    return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()

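# Illustrative behavior (a sketch, not part of the diff): is_cwd(Path.cwd())
# should return True; lowercasing both resolved paths makes the comparison
# tolerant of case-insensitive filesystems such as those common on Windows.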
def is_in_jupyter():
@ -722,6 +721,51 @@ def minibatch(items, size=8):
        yield list(batch)


def minibatch_by_padded_size(docs, size, buffer=256, discard_oversize=False):
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    else:
        size_ = size
    for outer_batch in minibatch(docs, buffer):
        outer_batch = list(outer_batch)
        target_size = next(size_)
        for indices in _batch_by_length(outer_batch, target_size):
            subbatch = [outer_batch[i] for i in indices]
            padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
            if discard_oversize and padded_size >= target_size:
                pass
            else:
                yield subbatch


def _batch_by_length(seqs, max_words):
    """Given a list of sequences, return a batched list of indices into the
    list, where the batches are grouped by length, in descending order.

    Batches may be at most max_words in size, defined as max sequence length * size.
    """
    # Use negative index so we can get sort by position ascending.
    lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)]
    lengths_indices.sort()
    batches = []
    batch = []
    for length, i in lengths_indices:
        if not batch:
            batch.append(i)
        elif length * (len(batch) + 1) <= max_words:
            batch.append(i)
        else:
            batches.append(batch)
            batch = [i]
    if batch:
        batches.append(batch)
    # Check lengths match
    assert sum(len(b) for b in batches) == len(seqs)
    batches = [list(sorted(batch)) for batch in batches]
    batches.reverse()
    return batches

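# Worked example (illustrative, not part of the diff): for sequences of
# lengths [1, 5, 4, 2] and max_words=8, sorting by (length, index) and
# grouping yields batches [[0, 3], [2], [1]]; after the final reverse,
# _batch_by_length returns [[1], [2], [0, 3]], longest sequences first.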
def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
    """Create minibatches of roughly a given number of words. If any examples
    are longer than the specified batch length, they will appear in a batch by

@ -768,7 +812,8 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):

        # yield the previous batch and start a new one. The new one gets the overflow examples.
        else:
            yield batch
            if batch:
                yield batch
            target_size = next(size_)
            tol_size = target_size * tolerance
            batch = overflow

@ -788,15 +833,15 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):

        # this example does not fit with the previous overflow: start another new batch
        else:
            yield batch
            if batch:
                yield batch
            target_size = next(size_)
            tol_size = target_size * tolerance
            batch = [doc]
            batch_size = n_words

    # yield the final batch
    batch.extend(overflow)
    if batch:
        batch.extend(overflow)
        yield batch
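# Illustrative behavior (a sketch based on the docstring and the tolerance
# logic above, not part of the diff): with size=20 and tolerance=0.2 a batch
# may grow to 24 words before it is yielded, and a doc longer than that is
# yielded in a batch by itself unless discard_oversize=True, in which case
# it is dropped.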
@ -4,4 +4,34 @@ teaser: Pre-defined model architectures included with the core library
source: spacy/ml/models
---

TODO: write
TODO: intro and how architectures work, link to
[`registry`](/api/top-level#registry),
[custom models](/usage/training#custom-models) usage etc.

## Parser architectures {source="spacy/ml/models/parser.py"}

### spacy.TransitionBasedParser.v1

<!-- TODO: intro -->

> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TransitionBasedParser.v1"
> nr_feature_tokens = 6
> hidden_width = 64
> maxout_pieces = 2
>
> [model.tok2vec]
> # ...
> ```

| Name                | Type                                       | Description |
| ------------------- | ------------------------------------------ | ----------- |
| `tok2vec`           | [`Model`](https://thinc.ai/docs/api-model) |             |
| `nr_feature_tokens` | int                                        |             |
| `hidden_width`      | int                                        |             |
| `maxout_pieces`     | int                                        |             |
| `use_upper`         | bool                                       |             |
| `nO`                | int                                        |             |
@ -297,60 +297,41 @@ will not be available.

## Train {#train}

<!-- TODO: document new training -->

Train a model. Expects data in spaCy's
[JSON format](/api/data-formats#json-input). On each epoch, a model will be
saved out to the directory. Accuracy scores and model details will be added to a
[`meta.json`](/usage/training#models-generating) to allow packaging the model
using the [`package`](/api/cli#package) command.
[binary format](/api/data-formats#training) and a
[config file](/api/data-formats#config) with all settings and hyperparameters.
Will save out the best model from all epochs, as well as the final model. The
`--code` argument can be used to provide a Python file that's imported before
the training process starts. This lets you register
[custom functions](/usage/training#custom-models) and architectures and refer to
them in your config, all while still using spaCy's built-in `train` workflow. If
you need to manage complex multi-step training workflows, check out the new
[spaCy projects](/usage/projects).

<Infobox title="New in v3.0" variant="warning">

As of spaCy v3.0, the `train` command doesn't take a long list of command-line
arguments anymore and instead expects a single
[`config.cfg` file](/usage/training#config) containing all settings for the
pipeline, training process and hyperparameters.

</Infobox>

```bash
$ python -m spacy train [lang] [output_path] [train_path] [dev_path]
[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping]
[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec]
[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level]
[--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel]
[--textcat-positive-label] [--verbose]
$ python -m spacy train [train_path] [dev_path] [config_path] [--output]
[--code] [--verbose]
```

| Argument | Type | Description |
| --- | --- | --- |
| `lang` | positional | Model language. |
| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. |
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. |
| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--replace-components`, `-R` | flag | Replace components from the base model. |
| `--vectors`, `-v` | option | Model to load vectors from. |
| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). |
| `--use-gpu`, `-g` | option | GPU ID or `-1` for CPU only (default: `-1`). |
| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. |
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. |
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
| `--width`, `-cw` <Tag variant="new">2.2.4</Tag> | option | Width of CNN layers of `Tok2Vec` component. |
| `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag> | option | Depth of CNN layers of `Tok2Vec` component. |
| `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag> | option | Window size for CNN layers of `Tok2Vec` component. |
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag> | option | Maxout size for CNN layers of `Tok2Vec` component. |
| `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag> | flag | Whether to use character-based embedding of `Tok2Vec` component. |
| `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag> | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). |
| `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag> | option | Number of embedding rows of `Tok2Vec` component. |
| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
| `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag> | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. |
| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). |
| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. |
| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. |
| `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option | Location of JSON-formatted tag map. |
| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | model, pickle | A spaCy model on each epoch. |
| Argument | Type | Description |
| --- | --- | --- |
| `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
| `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--output`, `-o` | option | Directory to store model in. Will be created if it doesn't exist. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | model | The final model and the best model. |

## Pretrain {#pretrain new="2.1" tag="experimental"}
@ -471,20 +452,20 @@ as separate files if the respective component is present in the model's
pipeline.

```bash
$ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit]
[--gpu-id] [--gold-preproc] [--return-scores]
$ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path]
[--displacy-limit] [--gpu-id] [--gold-preproc]
```

| Argument | Type | Description |
| --- | --- | --- |
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
| `data_path` | positional | Location of JSON-formatted evaluation data. |
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| `--return-scores`, `-R` | flag | Return dict containing model scores. |
| **CREATES** | `stdout`, HTML | Training results and optional displaCy visualizations. |
| Argument | Type | Description |
| --- | --- | --- |
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). |
| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. |
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. |

## Package {#package}
@ -504,15 +485,17 @@ so you don't have to run `python setup.py sdist` separately anymore.
</Infobox>

```bash
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
[--version] [--force]
```

```bash
### Example
python -m spacy package /input /output
cd /output/en_model-0.0.0
pip install dist/en_model-0.0.0.tar.gz
```
> #### Example
>
> ```bash
> python -m spacy package /input /output
> cd /output/en_model-0.0.0
> pip install dist/en_model-0.0.0.tar.gz
> ```

| Argument | Type | Description |
| --- | --- | --- |
@ -525,18 +508,137 @@ pip install dist/en_model-0.0.0.tar.gz
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | A Python package containing the spaCy model. |

## Project {#project}
## Project {#project new="3"}

<!-- TODO: document project command and subcommands. We should probably wait and only finalize this once we've finalized the design -->
The `spacy project` CLI includes subcommands for working with
[spaCy projects](/usage/projects), end-to-end workflows for building and
deploying custom spaCy models.

### project clone {#project-clone}

Clone a project template from a Git repository. Calls into `git` under the hood
and uses the sparse checkout feature, so you're only downloading what you need.
By default, spaCy's
[project templates repo](https://github.com/explosion/projects) is used, but you
can provide any other repo (public or private) that you have access to using the
`--repo` option.

<!-- TODO: update example once we've decided on repo structure -->

```bash
$ python -m spacy project clone [name] [dest] [--repo]
```

> #### Example
>
> ```bash
> $ python -m spacy project clone some_example
> ```
>
> Clone from custom repo:
>
> ```bash
> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
> ```

| Argument | Type | Description |
| --- | --- | --- |
| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. |
| `dest` | positional | Where to clone the project. Defaults to current working directory. |
| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). |
### project assets {#project-assets}

### project run-all {#project-run-all}
Fetch project assets like datasets and pretrained weights. Assets are defined in
the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a
`checksum` is provided, the file is only downloaded if no local file with the
same checksum exists and spaCy will show an error if the checksum of the
downloaded file doesn't match. If assets don't specify a `url` they're
considered "private" and you have to take care of putting them into the
destination directory yourself. If a local path is provided, the asset is copied
into the current project.

```bash
$ python -m spacy project assets [project_dir]
```

> #### Example
>
> ```bash
> $ python -m spacy project assets
> ```

| Argument | Type | Description |
| --- | --- | --- |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. |
### project run {#project-run}

### project init {#project-init}
Run a named command or workflow defined in the
[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
all commands in the workflow are run, in order. If commands define
[dependencies or outputs](/usage/projects#deps-outputs), they will only be
re-run if state has changed. For example, if the input dataset changes, a
preprocessing command that depends on those files will be re-run.

### project update-dvc {#project-update-dvc}
```bash
$ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
```

> #### Example
>
> ```bash
> $ python -m spacy project run train
> ```

| Argument | Type | Description |
| --- | --- | --- |
| `subcommand` | positional | Name of the command or workflow to run. |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
| `--help`, `-h` | flag | Show help message and available arguments. |
### project dvc {#project-dvc}

Auto-generate a [Data Version Control](https://dvc.org) (DVC) config file. Calls
[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline,
so you need to specify one workflow defined in the
[`project.yml`](/usage/projects#project-yml). If no workflow is specified, the
first defined workflow is used. The DVC config will only be updated if the
`project.yml` changed. For details, see the
[DVC integration](/usage/projects#dvc) docs.

<Infobox variant="warning">

This command requires DVC to be installed and initialized in the project
directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init).
You'll also need to add the assets you want to track with
[`dvc add`](https://dvc.org/doc/command-reference/add).

</Infobox>

```bash
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
```

> #### Example
>
> ```bash
> git init
> dvc init
> python -m spacy project dvc all
> ```

| Argument | Type | Description |
| --- | --- | --- |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
| `--force`, `-F` | flag | Force-updating config file. |
| `--verbose`, `-V` | flag | Print more output generated by DVC. |
| `--help`, `-h` | flag | Show help message and available arguments. |
@ -122,7 +122,7 @@ where the rescuers keep passing out from low oxygen, causing another rescuer to
follow — only to succumb themselves. In short, just say no to optimizing your
Python. If it's not fast enough the first time, just switch to Cython.

<Infobox title="📖 Resources">
<Infobox title="Resources" emoji="📖">

- [Official Cython documentation](http://docs.cython.org/en/latest/)
  (cython.org)
@ -2,7 +2,8 @@
title: Data formats
teaser: Details on spaCy's input and output data formats
menu:
  - ['Training data', 'training']
  - ['Training Data', 'training']
  - ['Training Config', 'config']
  - ['Vocabulary', 'vocab']
---

@ -74,6 +75,29 @@ from the English Wall Street Journal portion of the Penn Treebank:
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
```

## Training config {#config new="3"}

Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
[usage documentation](/usage/training#config).

<Infobox variant="warning">

The `@` syntax lets you refer to function names registered in the
[function registry](/api/top-level#registry). For example,
`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
the name `"spacy.HashEmbedCNN.v1"`, and all other values defined in its block
will be passed into that function as arguments. Those arguments depend on the
registered function. See the [model architectures](/api/architectures) docs for
API details.

</Infobox>
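For illustration only, a minimal hypothetical `[model]` block using this syntax
might look like the sketch below; the available argument names depend entirely
on the registered function and are not shown here:

```ini
[model]
@architectures = "spacy.HashEmbedCNN.v1"
# All other keys in this block are passed as arguments
# to the registered function.
```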
<!-- TODO: we need to come up with a good way to present the sections and their expected values visually? -->
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->

## Lexical data for vocabulary {#vocab-jsonl new="2"}

To populate a model's vocabulary, you can use the
@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"parser"`.

## DependencyParser.Model {#model tag="classmethod"}
## Default config {#config}

Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API. Wrappers are under development for most major machine
learning libraries.
This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.

| Name | Type | Description |
| --- | --- | --- |
| `**kwargs` | - | Parameters for initializing the model |
| **RETURNS** | object | The initialized model. |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_defaults.cfg
```
## DependencyParser.\_\_init\_\_ {#init tag="method"}

> #### Example
>
> ```python
> # Construction via create_pipe with default model
> parser = nlp.create_pipe("parser")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_parser"}}
> parser = nlp.create_pipe("parser", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import DependencyParser
> model = util.load_config("model.cfg", create_objects=True)["model"]
> parser = DependencyParser(nlp.vocab, model)
> ```

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).

> #### Example
>
> ```python
> # Construction via create_pipe
> parser = nlp.create_pipe("parser")
>
> # Construction from class
> from spacy.pipeline import DependencyParser
> parser = DependencyParser(nlp.vocab)
> parser.from_disk("/path/to/model")
> ```

| Name | Type | Description |
| --- | --- | --- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `**cfg` | - | Configuration parameters. |
| **RETURNS** | `DependencyParser` | The newly constructed object. |
| Name | Type | Description |
| --- | --- | --- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `**cfg` | - | Configuration parameters. |
| **RETURNS** | `DependencyParser` | The newly constructed object. |

## DependencyParser.\_\_call\_\_ {#call tag="method"}
@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | -------- | ------------------------------------------------------ |
|
||||
| `stream` | iterable | A stream of documents. |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | --------------- | ------------------------------------------------------ |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
|
||||

## DependencyParser.predict {#predict tag="method"}

@@ -104,7 +109,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.

| Name        | Type                | Description                                    |
| ----------- | ------------------- | ---------------------------------------------- |
| `docs`      | iterable            | The documents to predict.                      |
| `docs`      | `Iterable[Doc]`     | The documents to predict.                      |
| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). |

## DependencyParser.set_annotations {#set_annotations tag="method"}

@@ -119,33 +124,34 @@ Modify a batch of documents, using pre-computed scores.
> parser.set_annotations([doc1, doc2], scores)
> ```

| Name     | Type     | Description                                                |
| -------- | -------- | ---------------------------------------------------------- |
| `docs`   | iterable | The documents to modify.                                   |
| `scores` | -        | The scores to set, produced by `DependencyParser.predict`. |
| Name     | Type                | Description                                                |
| -------- | ------------------- | ---------------------------------------------------------- |
| `docs`   | `Iterable[Doc]`     | The documents to modify.                                   |
| `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. |

## DependencyParser.update {#update tag="method"}

Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/dependencyparser#predict) and
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/dependencyparser#predict) and
[`get_loss`](/api/dependencyparser#get_loss).

> #### Example
>
> ```python
> parser = DependencyParser(nlp.vocab)
> losses = {}
> parser = DependencyParser(nlp.vocab, parser_model)
> optimizer = nlp.begin_training()
> parser.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
> losses = parser.update(examples, sgd=optimizer)
> ```

| Name     | Type     | Description                                                                                   |
| -------- | -------- | --------------------------------------------------------------------------------------------- |
| `docs`   | iterable | A batch of documents to learn from.                                                           |
| `golds`  | iterable | The gold-standard data. Must have the same length as `docs`.                                  |
| `drop`   | float    | The dropout rate.                                                                             |
| `sgd`    | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID.        |
| `losses` | dict     | Optional record of the loss during training. The value keyed by the model's name is updated.  |
| Name              | Type                | Description                                                                                                                                     |
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                     |
| _keyword-only_    |                     |                                                                                                                                                 |
| `drop`            | float               | The dropout rate.                                                                                                                               |
| `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). |
| `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                                 |
| `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                    |
| **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                                |
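
As a rough usage sketch of the `Example`-based API, assuming a loaded `nlp`
object with a `parser` component and the `Example.from_dict` helper from the
v3-nightly `spacy.gold` module (the head and dependency values are invented
for a two-token text):

```python
from spacy.gold import Example

# Pair a predicted doc with invented reference annotations.
doc = nlp.make_doc("She sleeps")
examples = [Example.from_dict(doc, {"heads": [1, 1], "deps": ["nsubj", "ROOT"]})]

optimizer = nlp.begin_training()
losses = parser.update(examples, sgd=optimizer)
```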

## DependencyParser.get_loss {#get_loss tag="method"}

@@ -156,21 +162,20 @@ predicted scores.
>
> ```python
> parser = DependencyParser(nlp.vocab)
> scores = parser.predict([doc1, doc2])
> loss, d_loss = parser.get_loss([doc1, doc2], [gold1, gold2], scores)
> scores = parser.predict([eg.predicted for eg in examples])
> loss, d_loss = parser.get_loss(examples, scores)
> ```

| Name        | Type     | Description                                                  |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs`      | iterable | The batch of documents.                                      |
| `golds`     | iterable | The gold-standard data. Must have the same length as `docs`. |
| `scores`    | -        | Scores representing the model's predictions.                 |
| **RETURNS** | tuple    | The loss and the gradient, i.e. `(loss, gradient)`.          |
| Name        | Type                | Description                                         |
| ----------- | ------------------- | ---------------------------------------------------- |
| `examples`  | `Iterable[Example]` | The batch of examples.                               |
| `scores`    | `syntax.StateClass` | Scores representing the model's predictions.         |
| **RETURNS** | tuple               | The loss and the gradient, i.e. `(loss, gradient)`.  |

## DependencyParser.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added.
Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example
>
@@ -180,16 +185,17 @@ has been initialized yet, the model is added.
> optimizer = parser.begin_training(pipeline=nlp.pipeline)
> ```

| Name          | Type     | Description                                                                                                                                                                                  |
| ------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects.                                                                                           |
| `pipeline`    | list     | Optional list of pipeline components that this component is part of.                                                                                                                        |
| `sgd`         | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`DependencyParser`](/api/dependencyparser#create_optimizer) if not set. |
| **RETURNS**   | callable | An optimizer.                                                                                                                                                                               |
| Name           | Type                    | Description                                                                                                                                                           |
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                                  |
| `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                                  |
| `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set.  |
| **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                          |

## DependencyParser.create_optimizer {#create_optimizer tag="method"}

Create an optimizer for the pipeline component.
Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
component.

> #### Example
>
@@ -198,9 +204,9 @@ Create an optimizer for the pipeline component.
> optimizer = parser.create_optimizer()
> ```

| Name        | Type     | Description    |
| ----------- | -------- | -------------- |
| **RETURNS** | callable | The optimizer. |
| Name        | Type        | Description                                                     |
| ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |

## DependencyParser.use_params {#use_params tag="method, contextmanager"}

@@ -12,44 +12,47 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"entity_linker"`.

## EntityLinker.Model {#model tag="classmethod"}
## Default config {#config}

Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the
context encoder. Wrappers are under development for most major machine learning
libraries.
This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.

| Name        | Type   | Description                           |
| ----------- | ------ | ------------------------------------- |
| `**kwargs`  | -      | Parameters for initializing the model |
| **RETURNS** | object | The initialized model.                |

```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_linker_defaults.cfg
```

## EntityLinker.\_\_init\_\_ {#init tag="method"}

> #### Example
>
> ```python
> # Construction via create_pipe with default model
> entity_linker = nlp.create_pipe("entity_linker")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_el"}}
> entity_linker = nlp.create_pipe("entity_linker", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import EntityLinker
> model = util.load_config("model.cfg", create_objects=True)["model"]
> entity_linker = EntityLinker(nlp.vocab, model)
> ```

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).

> #### Example
>
> ```python
> # Construction via create_pipe
> entity_linker = nlp.create_pipe("entity_linker")
>
> # Construction from class
> from spacy.pipeline import EntityLinker
> entity_linker = EntityLinker(nlp.vocab)
> entity_linker.from_disk("/path/to/model")
> ```

| Name    | Type    | Description                                                                      |
| ------- | ------- | -------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary.                                                           |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `**cfg` | -       | Configuration parameters.                                                        |

| Name           | Type                          | Description                                                                                                                                            |
| -------------- | ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `vocab`        | `Vocab`                       | The shared vocabulary.                                                                                                                                 |
| `model`        | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `hidden_width` | int                           | Width of the hidden layer of the entity linking model, defaults to `128`.                                                                             |
| `incl_prior`   | bool                          | Whether or not to include prior probabilities in the model. Defaults to `True`.                                                                       |
| `incl_context` | bool                          | Whether or not to include the local context in the model (if not: only prior probabilities are used). Defaults to `True`.                             |
| **RETURNS**    | `EntityLinker`                | The newly constructed object.                                                                                                                          |
| **RETURNS**    | `EntityLinker`                | The newly constructed object.                                                                                                                          |

## EntityLinker.\_\_call\_\_ {#call tag="method"}

@@ -91,11 +94,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
> pass
> ```

| Name         | Type     | Description                                            |
| ------------ | -------- | ------------------------------------------------------ |
| `stream`     | iterable | A stream of documents.                                 |
| `batch_size` | int      | The number of texts to buffer. Defaults to `128`.      |
| **YIELDS**   | `Doc`    | Processed documents in the order of the original text. |
| Name         | Type            | Description                                            |
| ------------ | --------------- | ------------------------------------------------------ |
| `stream`     | `Iterable[Doc]` | A stream of documents.                                 |
| `batch_size` | int             | The number of texts to buffer. Defaults to `128`.      |
| **YIELDS**   | `Doc`           | Processed documents in the order of the original text. |

## EntityLinker.predict {#predict tag="method"}

@@ -105,13 +108,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
> kb_ids = entity_linker.predict([doc1, doc2])
> ```

| Name        | Type     | Description                                                                                                                                                                                        |
| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs`      | iterable | The documents to predict.                                                                                                                                                                           |
| **RETURNS** | tuple    | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. |
| Name        | Type            | Description                                                  |
| ----------- | --------------- | ------------------------------------------------------------- |
| `docs`      | `Iterable[Doc]` | The documents to predict.                                     |
| **RETURNS** | `Iterable[str]` | The predicted KB identifiers for the entities in the `docs`.  |

## EntityLinker.set_annotations {#set_annotations tag="method"}

@@ -122,19 +125,18 @@ entities.
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors)
> kb_ids = entity_linker.predict([doc1, doc2])
> entity_linker.set_annotations([doc1, doc2], kb_ids)
> ```

| Name      | Type     | Description                                                                                        |
| --------- | -------- | --------------------------------------------------------------------------------------------------- |
| `docs`    | iterable | The documents to modify.                                                                            |
| `kb_ids`  | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`.   |
| `tensors` | iterable | The token representations used to predict the identifiers.                                          |
| Name     | Type            | Description                                                                                        |
| -------- | --------------- | --------------------------------------------------------------------------------------------------- |
| `docs`   | `Iterable[Doc]` | The documents to modify.                                                                            |
| `kb_ids` | `Iterable[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`.   |

## EntityLinker.update {#update tag="method"}

Learn from a batch of documents and gold-standard information, updating both the
Learn from a batch of [`Example`](/api/example) objects, updating both the
pipe's entity linking model and context encoder. Delegates to
[`predict`](/api/entitylinker#predict) and
[`get_loss`](/api/entitylinker#get_loss).

@@ -142,40 +144,20 @@ pipe's entity linking model and context encoder. Delegates to
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> losses = {}
> entity_linker = EntityLinker(nlp.vocab, nel_model)
> optimizer = nlp.begin_training()
> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
> losses = entity_linker.update(examples, sgd=optimizer)
> ```

| Name     | Type     | Description                                                                                              |
| -------- | -------- | ---------------------------------------------------------------------------------------------------------- |
| `docs`   | iterable | A batch of documents to learn from.                                                                        |
| `golds`  | iterable | The gold-standard data. Must have the same length as `docs`.                                               |
| `drop`   | float    | The dropout rate, used both for the EL model and the context encoder.                                      |
| `sgd`    | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID.    |
| `losses` | dict     | Optional record of the loss during training. The value keyed by the model's name is updated.               |

## EntityLinker.get_loss {#get_loss tag="method"}

Find the loss and gradient of loss for the entities in a batch of documents and
their predicted scores.

> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict(docs)
> loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors)
> ```

| Name        | Type     | Description                                                  |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs`      | iterable | The batch of documents.                                      |
| `golds`     | iterable | The gold-standard data. Must have the same length as `docs`. |
| `kb_ids`    | iterable | KB identifiers representing the model's predictions.         |
| `tensors`   | iterable | The token representations used to predict the identifiers.   |
| **RETURNS** | tuple    | The loss and the gradient, i.e. `(loss, gradient)`.          |
| Name              | Type                | Description                                                                                                                                 |
| ----------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                 |
| _keyword-only_    |                     |                                                                                                                                             |
| `drop`            | float               | The dropout rate.                                                                                                                           |
| `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations). |
| `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                             |
| `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                |
| **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                            |

## EntityLinker.set_kb {#set_kb tag="method"}

@@ -195,9 +177,9 @@ identifiers.

## EntityLinker.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added. Before calling this method, a
knowledge base should have been defined with
Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this
method, a knowledge base should have been defined with
[`set_kb`](/api/entitylinker#set_kb).

> #### Example
>
@@ -209,12 +191,12 @@ knowledge base should have been defined with
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
> ```

| Name          | Type     | Description                                                                                                                                                                          |
| ------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects.                                                                                   |
| `pipeline`    | list     | Optional list of pipeline components that this component is part of.                                                                                                                |
| `sgd`         | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. |
| **RETURNS**   | callable | An optimizer.                                                                                                                                                                        |
| Name           | Type                    | Description                                                                                                                                                       |
| -------------- | ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                               |
| `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                               |
| `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set.   |
| **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                       |
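
Since the knowledge base has to exist before training, a minimal setup sketch
might look like the following. The entity `Q42`, its frequency, vector, and
alias are invented for illustration, and the snippet assumes the
`KnowledgeBase` API from `spacy.kb`:

```python
from spacy.kb import KnowledgeBase

# Invented example data: one entity with a 3-dimensional vector and one alias.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 0.0, 0.5])
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])

entity_linker.set_kb(kb)
optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
```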

## EntityLinker.create_optimizer {#create_optimizer tag="method"}

@@ -227,9 +209,9 @@ Create an optimizer for the pipeline component.
> optimizer = entity_linker.create_optimizer()
> ```

| Name        | Type     | Description    |
| ----------- | -------- | -------------- |
| **RETURNS** | callable | The optimizer. |
| Name        | Type        | Description                                                     |
| ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |

## EntityLinker.use_params {#use_params tag="method, contextmanager"}

@@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"ner"`.

## EntityRecognizer.Model {#model tag="classmethod"}
## Default config {#config}

Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API. Wrappers are under development for most major machine
learning libraries.
This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.

| Name        | Type   | Description                           |
| ----------- | ------ | ------------------------------------- |
| `**kwargs`  | -      | Parameters for initializing the model |
| **RETURNS** | object | The initialized model.                |

```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defaults.cfg
```

## EntityRecognizer.\_\_init\_\_ {#init tag="method"}

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).

> #### Example
>
> ```python
> # Construction via create_pipe
> ner = nlp.create_pipe("ner")
>
> # Construction from class
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_ner"}}
> ner = nlp.create_pipe("ner", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import EntityRecognizer
> ner = EntityRecognizer(nlp.vocab)
> ner.from_disk("/path/to/model")
> model = util.load_config("model.cfg", create_objects=True)["model"]
> ner = EntityRecognizer(nlp.vocab, model)
> ```

| Name        | Type                          | Description                                                                                                                                            |
| ----------- | ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `vocab`     | `Vocab`                       | The shared vocabulary.                                                                                                                                 |
| `model`     | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `**cfg`     | -                             | Configuration parameters.                                                                                                                              |
| **RETURNS** | `EntityRecognizer`            | The newly constructed object.                                                                                                                          |

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).

| Name        | Type               | Description                                                                      |
| ----------- | ------------------ | -------------------------------------------------------------------------------- |
| `vocab`     | `Vocab`            | The shared vocabulary.                                                           |
| `model`     | `Model`            | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `**cfg`     | -                  | Configuration parameters.                                                        |
| **RETURNS** | `EntityRecognizer` | The newly constructed object.                                                    |

## EntityRecognizer.\_\_call\_\_ {#call tag="method"}

@@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
> pass
> ```

| Name         | Type     | Description                                            |
| ------------ | -------- | ------------------------------------------------------ |
| `stream`     | iterable | A stream of documents.                                 |
| `batch_size` | int      | The number of texts to buffer. Defaults to `128`.      |
| **YIELDS**   | `Doc`    | Processed documents in the order of the original text. |
| Name         | Type            | Description                                            |
| ------------ | --------------- | ------------------------------------------------------ |
| `stream`     | `Iterable[Doc]` | A stream of documents.                                 |
| `batch_size` | int             | The number of texts to buffer. Defaults to `128`.      |
| **YIELDS**   | `Doc`           | Processed documents in the order of the original text. |

## EntityRecognizer.predict {#predict tag="method"}

@@ -99,13 +104,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores, tensors = ner.predict([doc1, doc2])
> scores = ner.predict([doc1, doc2])
> ```

| Name        | Type     | Description                                                                                                |
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------ |
| `docs`      | iterable | The documents to predict.                                                                                    |
| **RETURNS** | list     | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal).   |
| Name        | Type               | Description                                                                                                |
| ----------- | ------------------ | ------------------------------------------------------------------------------------------------------------ |
| `docs`      | `Iterable[Doc]`    | The documents to predict.                                                                                    |
| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal).   |

## EntityRecognizer.set_annotations {#set_annotations tag="method"}

@@ -115,38 +120,38 @@ Modify a batch of documents, using pre-computed scores.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores, tensors = ner.predict([doc1, doc2])
> ner.set_annotations([doc1, doc2], scores, tensors)
> scores = ner.predict([doc1, doc2])
> ner.set_annotations([doc1, doc2], scores)
> ```

| Name      | Type     | Description                                                |
| --------- | -------- | ----------------------------------------------------------- |
| `docs`    | iterable | The documents to modify.                                    |
| `scores`  | -        | The scores to set, produced by `EntityRecognizer.predict`.  |
| `tensors` | iterable | The token representations used to predict the scores.       |
| Name     | Type               | Description                                                |
| -------- | ------------------ | ----------------------------------------------------------- |
| `docs`   | `Iterable[Doc]`    | The documents to modify.                                    |
| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`.  |

## EntityRecognizer.update {#update tag="method"}

Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/entityrecognizer#predict) and
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/entityrecognizer#predict) and
[`get_loss`](/api/entityrecognizer#get_loss).

> #### Example
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> losses = {}
> ner = EntityRecognizer(nlp.vocab, ner_model)
> optimizer = nlp.begin_training()
> ner.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
> losses = ner.update(examples, sgd=optimizer)
> ```

| Name     | Type     | Description                                                                                   |
| -------- | -------- | --------------------------------------------------------------------------------------------- |
| `docs`   | iterable | A batch of documents to learn from.                                                           |
| `golds`  | iterable | The gold-standard data. Must have the same length as `docs`.                                  |
| `drop`   | float    | The dropout rate.                                                                             |
| `sgd`    | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID.        |
| `losses` | dict     | Optional record of the loss during training. The value keyed by the model's name is updated.  |
| Name              | Type                | Description                                                                                                                                     |
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                     |
| _keyword-only_    |                     |                                                                                                                                                 |
| `drop`            | float               | The dropout rate.                                                                                                                               |
| `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). |
| `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                                 |
| `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                    |
| **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                                |
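
For named entities, reference annotations can likewise be passed as
character-offset tuples. A rough sketch, again assuming the v3-nightly
`Example.from_dict` helper and invented offsets:

```python
from spacy.gold import Example

# Invented annotation: "Apple" spans characters 0-5 and is labeled ORG.
doc = nlp.make_doc("Apple is huge")
examples = [Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})]

optimizer = nlp.begin_training()
losses = ner.update(examples, sgd=optimizer)
```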

## EntityRecognizer.get_loss {#get_loss tag="method"}

@@ -157,21 +162,20 @@ predicted scores.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([doc1, doc2])
> loss, d_loss = ner.get_loss([doc1, doc2], [gold1, gold2], scores)
> scores = ner.predict([eg.predicted for eg in examples])
> loss, d_loss = ner.get_loss(examples, scores)
> ```

| Name        | Type     | Description                                                  |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs`      | iterable | The batch of documents.                                      |
| `golds`     | iterable | The gold-standard data. Must have the same length as `docs`. |
| `scores`    | -        | Scores representing the model's predictions.                 |
| **RETURNS** | tuple    | The loss and the gradient, i.e. `(loss, gradient)`.          |
| Name        | Type                | Description                                         |
| ----------- | ------------------- | ---------------------------------------------------- |
| `examples`  | `Iterable[Example]` | The batch of examples.                               |
| `scores`    | `List[StateClass]`  | Scores representing the model's predictions.         |
| **RETURNS** | tuple               | The loss and the gradient, i.e. `(loss, gradient)`.  |

## EntityRecognizer.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added.
Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example
>
@@ -181,12 +185,12 @@ has been initialized yet, the model is added.
> optimizer = ner.begin_training(pipeline=nlp.pipeline)
> ```

| Name          | Type     | Description                                                                                                                                                                                  |
| ------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects.                                                                                           |
| `pipeline`    | list     | Optional list of pipeline components that this component is part of.                                                                                                                        |
| `sgd`         | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityRecognizer`](/api/entityrecognizer#create_optimizer) if not set. |
| **RETURNS**   | callable | An optimizer.                                                                                                                                                                               |
| Name           | Type                    | Description                                                                                                                                                           |
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                                  |
| `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                                  |
| `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set.  |
| **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                          |

## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}

@@ -199,9 +203,9 @@ Create an optimizer for the pipeline component.
> optimizer = ner.create_optimizer()
> ```

| Name        | Type     | Description    |
| ----------- | -------- | -------------- |
| **RETURNS** | callable | The optimizer. |
| Name        | Type        | Description                                                     |
| ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |

## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}