mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-19 13:00:35 +03:00
Merge remote-tracking branch 'origin/develop' into rliaw-develop
This commit is contained in:
commit
3bccf8b954
|
@ -5,16 +5,16 @@
|
||||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||||
gold_preproc = false
|
gold_preproc = false
|
||||||
# Limitations on training document length or number of examples.
|
# Limitations on training document length or number of examples.
|
||||||
max_length = 5000
|
max_length = 3000
|
||||||
limit = 0
|
limit = 0
|
||||||
# Data augmentation
|
# Data augmentation
|
||||||
orth_variant_level = 0.0
|
orth_variant_level = 0.0
|
||||||
dropout = 0.2
|
dropout = 0.1
|
||||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||||
patience = 1600
|
patience = 100000
|
||||||
max_epochs = 0
|
max_epochs = 0
|
||||||
max_steps = 20000
|
max_steps = 0
|
||||||
eval_frequency = 500
|
eval_frequency = 1000
|
||||||
# Other settings
|
# Other settings
|
||||||
seed = 0
|
seed = 0
|
||||||
accumulate_gradient = 1
|
accumulate_gradient = 1
|
||||||
|
@ -26,6 +26,7 @@ score_weights = {"ents_f": 1.0}
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
omit_extra_lookups = false
|
omit_extra_lookups = false
|
||||||
|
batch_by = "words"
|
||||||
|
|
||||||
[training.batch_size]
|
[training.batch_size]
|
||||||
@schedules = "compounding.v1"
|
@schedules = "compounding.v1"
|
||||||
|
@ -37,19 +38,13 @@ compound = 1.001
|
||||||
@optimizers = "Adam.v1"
|
@optimizers = "Adam.v1"
|
||||||
beta1 = 0.9
|
beta1 = 0.9
|
||||||
beta2 = 0.999
|
beta2 = 0.999
|
||||||
L2_is_weight_decay = false
|
L2_is_weight_decay = true
|
||||||
L2 = 1e-6
|
L2 = 0.01
|
||||||
grad_clip = 1.0
|
grad_clip = 1.0
|
||||||
use_averages = true
|
use_averages = true
|
||||||
eps = 1e-8
|
eps = 1e-8
|
||||||
learn_rate = 0.001
|
learn_rate = 0.001
|
||||||
|
|
||||||
#[optimizer.learn_rate]
|
|
||||||
#@schedules = "warmup_linear.v1"
|
|
||||||
#warmup_steps = 250
|
|
||||||
#total_steps = 20000
|
|
||||||
#initial_rate = 0.001
|
|
||||||
|
|
||||||
[nlp]
|
[nlp]
|
||||||
lang = "en"
|
lang = "en"
|
||||||
vectors = null
|
vectors = null
|
||||||
|
@ -58,8 +53,6 @@ vectors = null
|
||||||
factory = "ner"
|
factory = "ner"
|
||||||
learn_tokens = false
|
learn_tokens = false
|
||||||
min_action_freq = 1
|
min_action_freq = 1
|
||||||
beam_width = 1
|
|
||||||
beam_update_prob = 1.0
|
|
||||||
|
|
||||||
[nlp.pipeline.ner.model]
|
[nlp.pipeline.ner.model]
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a2"
|
__version__ = "3.0.0a4"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
|
||||||
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
||||||
|
|
|
@ -11,12 +11,15 @@ from .profile import profile # noqa: F401
|
||||||
from .train import train_cli # noqa: F401
|
from .train import train_cli # noqa: F401
|
||||||
from .pretrain import pretrain # noqa: F401
|
from .pretrain import pretrain # noqa: F401
|
||||||
from .debug_data import debug_data # noqa: F401
|
from .debug_data import debug_data # noqa: F401
|
||||||
|
from .debug_model import debug_model # noqa: F401
|
||||||
from .evaluate import evaluate # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_model import init_model # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project import project_clone, project_assets, project_run # noqa: F401
|
from .project.clone import project_clone # noqa: F401
|
||||||
from .project import project_run_all # noqa: F401
|
from .project.assets import project_assets # noqa: F401
|
||||||
|
from .project.run import project_run # noqa: F401
|
||||||
|
from .project.dvc import project_update_dvc # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
|
|
|
@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli
|
DOCS: https://spacy.io/api/cli
|
||||||
"""
|
"""
|
||||||
|
PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
|
||||||
|
project templates. You'd typically start by cloning a project template to a local
|
||||||
|
directory and fetching its assets like datasets etc. See the project's
|
||||||
|
project.yml for the available commands.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
app = typer.Typer(name=NAME, help=HELP)
|
app = typer.Typer(name=NAME, help=HELP)
|
||||||
|
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||||
|
app.add_typer(project_cli)
|
||||||
|
|
||||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||||
# keep the names short, but not needed at the moment.
|
# keep the names short, but not needed at the moment.
|
||||||
|
|
|
@ -120,8 +120,12 @@ def convert(
|
||||||
no_print=silent,
|
no_print=silent,
|
||||||
ner_map=ner_map,
|
ner_map=ner_map,
|
||||||
)
|
)
|
||||||
|
if file_type == "json":
|
||||||
|
data = [docs_to_json(docs)]
|
||||||
|
else:
|
||||||
|
data = DocBin(docs=docs, store_user_data=True).to_bytes()
|
||||||
if output_dir == "-":
|
if output_dir == "-":
|
||||||
_print_docs_to_stdout(docs, file_type)
|
_print_docs_to_stdout(data, file_type)
|
||||||
else:
|
else:
|
||||||
if input_loc != input_path:
|
if input_loc != input_path:
|
||||||
subpath = input_loc.relative_to(input_path)
|
subpath = input_loc.relative_to(input_path)
|
||||||
|
@ -129,24 +133,23 @@ def convert(
|
||||||
else:
|
else:
|
||||||
output_file = Path(output_dir) / input_loc.parts[-1]
|
output_file = Path(output_dir) / input_loc.parts[-1]
|
||||||
output_file = output_file.with_suffix(f".{file_type}")
|
output_file = output_file.with_suffix(f".{file_type}")
|
||||||
_write_docs_to_file(docs, output_file, file_type)
|
_write_docs_to_file(data, output_file, file_type)
|
||||||
msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
|
msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
|
||||||
|
|
||||||
|
|
||||||
def _print_docs_to_stdout(docs, output_type):
|
def _print_docs_to_stdout(data, output_type):
|
||||||
if output_type == "json":
|
if output_type == "json":
|
||||||
srsly.write_json("-", [docs_to_json(docs)])
|
srsly.write_json("-", data)
|
||||||
else:
|
else:
|
||||||
sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
|
sys.stdout.buffer.write(data)
|
||||||
|
|
||||||
|
|
||||||
def _write_docs_to_file(docs, output_file, output_type):
|
def _write_docs_to_file(data, output_file, output_type):
|
||||||
if not output_file.parent.exists():
|
if not output_file.parent.exists():
|
||||||
output_file.parent.mkdir(parents=True)
|
output_file.parent.mkdir(parents=True)
|
||||||
if output_type == "json":
|
if output_type == "json":
|
||||||
srsly.write_json(output_file, [docs_to_json(docs)])
|
srsly.write_json(output_file, data)
|
||||||
else:
|
else:
|
||||||
data = DocBin(docs=docs, store_user_data=True).to_bytes()
|
|
||||||
with output_file.open("wb") as file_:
|
with output_file.open("wb") as file_:
|
||||||
file_.write(data)
|
file_.write(data)
|
||||||
|
|
||||||
|
|
168
spacy/cli/debug_model.py
Normal file
168
spacy/cli/debug_model.py
Normal file
|
@ -0,0 +1,168 @@
|
||||||
|
from typing import List
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
|
from .. import util
|
||||||
|
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
||||||
|
from ..lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
@app.command("debug-model")
|
||||||
|
def debug_model_cli(
|
||||||
|
# fmt: off
|
||||||
|
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||||
|
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
|
||||||
|
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
|
||||||
|
parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
|
||||||
|
gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
|
||||||
|
attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
|
||||||
|
P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
|
||||||
|
P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
|
||||||
|
P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
|
||||||
|
P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
|
||||||
|
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
|
||||||
|
seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Analyze a Thinc ML model - internal structure and activations during training
|
||||||
|
"""
|
||||||
|
print_settings = {
|
||||||
|
"dimensions": dimensions,
|
||||||
|
"parameters": parameters,
|
||||||
|
"gradients": gradients,
|
||||||
|
"attributes": attributes,
|
||||||
|
"layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
|
||||||
|
"print_before_training": P0,
|
||||||
|
"print_after_init": P1,
|
||||||
|
"print_after_training": P2,
|
||||||
|
"print_prediction": P3,
|
||||||
|
}
|
||||||
|
|
||||||
|
if seed is not None:
|
||||||
|
msg.info(f"Fixing random seed: {seed}")
|
||||||
|
fix_random_seed(seed)
|
||||||
|
if use_gpu >= 0:
|
||||||
|
msg.info(f"Using GPU: {use_gpu}")
|
||||||
|
require_gpu(use_gpu)
|
||||||
|
else:
|
||||||
|
msg.info(f"Using CPU")
|
||||||
|
|
||||||
|
debug_model(
|
||||||
|
config_path,
|
||||||
|
print_settings=print_settings,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def debug_model(
|
||||||
|
config_path: Path,
|
||||||
|
*,
|
||||||
|
print_settings=None
|
||||||
|
):
|
||||||
|
if print_settings is None:
|
||||||
|
print_settings = {}
|
||||||
|
|
||||||
|
model = util.load_config(config_path, create_objects=True)["model"]
|
||||||
|
|
||||||
|
# STEP 0: Printing before training
|
||||||
|
msg.info(f"Analysing model with ID {model.id}")
|
||||||
|
if print_settings.get("print_before_training"):
|
||||||
|
msg.info(f"Before training:")
|
||||||
|
_print_model(model, print_settings)
|
||||||
|
|
||||||
|
# STEP 1: Initializing the model and printing again
|
||||||
|
model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
|
||||||
|
if print_settings.get("print_after_init"):
|
||||||
|
msg.info(f"After initialization:")
|
||||||
|
_print_model(model, print_settings)
|
||||||
|
|
||||||
|
# STEP 2: Updating the model and printing again
|
||||||
|
optimizer = Adam(0.001)
|
||||||
|
set_dropout_rate(model, 0.2)
|
||||||
|
for e in range(3):
|
||||||
|
Y, get_dX = model.begin_update(_get_docs())
|
||||||
|
dY = get_gradient(model, Y)
|
||||||
|
_ = get_dX(dY)
|
||||||
|
model.finish_update(optimizer)
|
||||||
|
if print_settings.get("print_after_training"):
|
||||||
|
msg.info(f"After training:")
|
||||||
|
_print_model(model, print_settings)
|
||||||
|
|
||||||
|
# STEP 3: the final prediction
|
||||||
|
prediction = model.predict(_get_docs())
|
||||||
|
if print_settings.get("print_prediction"):
|
||||||
|
msg.info(f"Prediction:", str(prediction))
|
||||||
|
|
||||||
|
|
||||||
|
def get_gradient(model, Y):
|
||||||
|
goldY = _get_output(model.ops.xp)
|
||||||
|
return Y - goldY
|
||||||
|
|
||||||
|
|
||||||
|
def _sentences():
|
||||||
|
return [
|
||||||
|
"Apple is looking at buying U.K. startup for $1 billion",
|
||||||
|
"Autonomous cars shift insurance liability toward manufacturers",
|
||||||
|
"San Francisco considers banning sidewalk delivery robots",
|
||||||
|
"London is a big city in the United Kingdom.",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_docs():
|
||||||
|
nlp = English()
|
||||||
|
return list(nlp.pipe(_sentences()))
|
||||||
|
|
||||||
|
|
||||||
|
def _get_output(xp):
|
||||||
|
return xp.asarray([xp.asarray([i+10, i+20, i+30], dtype="float32") for i, _ in enumerate(_get_docs())])
|
||||||
|
|
||||||
|
|
||||||
|
def _print_model(model, print_settings):
|
||||||
|
layers = print_settings.get("layers", "")
|
||||||
|
parameters = print_settings.get("parameters", False)
|
||||||
|
dimensions = print_settings.get("dimensions", False)
|
||||||
|
gradients = print_settings.get("gradients", False)
|
||||||
|
attributes = print_settings.get("attributes", False)
|
||||||
|
|
||||||
|
for i, node in enumerate(model.walk()):
|
||||||
|
if not layers or i in layers:
|
||||||
|
msg.info(f"Layer {i}: model ID {node.id}: '{node.name}'")
|
||||||
|
|
||||||
|
if dimensions:
|
||||||
|
for name in node.dim_names:
|
||||||
|
if node.has_dim(name):
|
||||||
|
msg.info(f" - dim {name}: {node.get_dim(name)}")
|
||||||
|
else:
|
||||||
|
msg.info(f" - dim {name}: {node.has_dim(name)}")
|
||||||
|
|
||||||
|
if parameters:
|
||||||
|
for name in node.param_names:
|
||||||
|
if node.has_param(name):
|
||||||
|
print_value = _print_matrix(node.get_param(name))
|
||||||
|
msg.info(f" - param {name}: {print_value}")
|
||||||
|
else:
|
||||||
|
msg.info(f" - param {name}: {node.has_param(name)}")
|
||||||
|
if gradients:
|
||||||
|
for name in node.param_names:
|
||||||
|
if node.has_grad(name):
|
||||||
|
print_value = _print_matrix(node.get_grad(name))
|
||||||
|
msg.info(f" - grad {name}: {print_value}")
|
||||||
|
else:
|
||||||
|
msg.info(f" - grad {name}: {node.has_grad(name)}")
|
||||||
|
if attributes:
|
||||||
|
attrs = node.attrs
|
||||||
|
for name, value in attrs.items():
|
||||||
|
msg.info(f" - attr {name}: {value}")
|
||||||
|
|
||||||
|
|
||||||
|
def _print_matrix(value):
|
||||||
|
if value is None or isinstance(value, bool):
|
||||||
|
return value
|
||||||
|
result = str(value.shape) + " - sample: "
|
||||||
|
sample_matrix = value
|
||||||
|
for d in range(value.ndim-1):
|
||||||
|
sample_matrix = sample_matrix[0]
|
||||||
|
sample_matrix = sample_matrix[0:5]
|
||||||
|
result = result + str(sample_matrix)
|
||||||
|
return result
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Sequence, Union
|
from typing import Optional, Sequence
|
||||||
import requests
|
import requests
|
||||||
import sys
|
import sys
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import is_package, get_base_version, run_command
|
from ..util import is_package, get_base_version, run_command
|
||||||
|
|
||||||
|
# These are the old shortcuts we previously supported in spacy download. As of
|
||||||
|
# v3, shortcuts are deprecated so we're not expecting to add anything to this
|
||||||
|
# list. It only exists to show users warnings.
|
||||||
|
OLD_SHORTCUTS = {
|
||||||
|
"en": "en_core_web_sm",
|
||||||
|
"de": "de_core_news_sm",
|
||||||
|
"es": "es_core_news_sm",
|
||||||
|
"pt": "pt_core_news_sm",
|
||||||
|
"fr": "fr_core_news_sm",
|
||||||
|
"it": "it_core_news_sm",
|
||||||
|
"nl": "nl_core_news_sm",
|
||||||
|
"el": "el_core_news_sm",
|
||||||
|
"nb": "nb_core_news_sm",
|
||||||
|
"lt": "lt_core_news_sm",
|
||||||
|
"xx": "xx_ent_wiki_sm",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
"download",
|
"download",
|
||||||
|
@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
version = components[-1]
|
version = components[-1]
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
else:
|
else:
|
||||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
model_name = model
|
||||||
model_name = shortcuts.get(model, model)
|
if model in OLD_SHORTCUTS:
|
||||||
|
msg.warn(
|
||||||
|
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
|
||||||
|
f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
|
||||||
|
)
|
||||||
|
model_name = OLD_SHORTCUTS[model]
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
|
@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_json(url: str, desc: str) -> Union[dict, list]:
|
def get_compatibility() -> dict:
|
||||||
r = requests.get(url)
|
version = get_base_version(about.__version__)
|
||||||
|
r = requests.get(about.__compatibility__)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"Server error ({r.status_code})",
|
f"Server error ({r.status_code})",
|
||||||
f"Couldn't fetch {desc}. Please find a model for your spaCy "
|
f"Couldn't fetch compatibility table. Please find a model for your spaCy "
|
||||||
f"installation (v{about.__version__}), and download it manually. "
|
f"installation (v{about.__version__}), and download it manually. "
|
||||||
f"For more details, see the documentation: "
|
f"For more details, see the documentation: "
|
||||||
f"https://spacy.io/usage/models",
|
f"https://spacy.io/usage/models",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
return r.json()
|
comp_table = r.json()
|
||||||
|
|
||||||
|
|
||||||
def get_compatibility() -> dict:
|
|
||||||
version = get_base_version(about.__version__)
|
|
||||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
|
||||||
comp = comp_table["spacy"]
|
comp = comp_table["spacy"]
|
||||||
if version not in comp:
|
if version not in comp:
|
||||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
||||||
|
|
|
@ -1,708 +0,0 @@
|
||||||
from typing import List, Dict, Any, Optional, Sequence
|
|
||||||
import typer
|
|
||||||
import srsly
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import subprocess
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
import sys
|
|
||||||
import requests
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
from ._app import app, Arg, Opt, COMMAND, NAME
|
|
||||||
from .. import about
|
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
|
||||||
from ..util import ensure_path, run_command, make_tempdir, working_dir
|
|
||||||
from ..util import get_hash, get_checksum, split_command
|
|
||||||
|
|
||||||
|
|
||||||
CONFIG_FILE = "project.yml"
|
|
||||||
DVC_CONFIG = "dvc.yaml"
|
|
||||||
DVC_DIR = ".dvc"
|
|
||||||
DIRS = [
|
|
||||||
"assets",
|
|
||||||
"metas",
|
|
||||||
"configs",
|
|
||||||
"packages",
|
|
||||||
"metrics",
|
|
||||||
"scripts",
|
|
||||||
"notebooks",
|
|
||||||
"training",
|
|
||||||
"corpus",
|
|
||||||
]
|
|
||||||
CACHES = [
|
|
||||||
Path.home() / ".torch",
|
|
||||||
Path.home() / ".caches" / "torch",
|
|
||||||
os.environ.get("TORCH_HOME"),
|
|
||||||
Path.home() / ".keras",
|
|
||||||
]
|
|
||||||
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
|
|
||||||
# it directly and edit the project.yml instead and re-run the project."""
|
|
||||||
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
|
|
||||||
templates. You'd typically start by cloning a project template to a local
|
|
||||||
directory and fetching its assets like datasets etc. See the project's
|
|
||||||
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
|
|
||||||
Version Control) to manage input and output files and to ensure steps are only
|
|
||||||
re-run if their inputs change.
|
|
||||||
"""
|
|
||||||
|
|
||||||
project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.callback(invoke_without_command=True)
|
|
||||||
def callback(ctx: typer.Context):
|
|
||||||
"""This runs before every project command and ensures DVC is installed."""
|
|
||||||
ensure_dvc()
|
|
||||||
|
|
||||||
|
|
||||||
################
|
|
||||||
# CLI COMMANDS #
|
|
||||||
################
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("clone")
|
|
||||||
def project_clone_cli(
|
|
||||||
# fmt: off
|
|
||||||
name: str = Arg(..., help="The name of the template to fetch"),
|
|
||||||
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
|
||||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
|
||||||
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Clone a project template from a repository. Calls into "git" and will
|
|
||||||
only download the files from the given subdirectory. The GitHub repo
|
|
||||||
defaults to the official spaCy template repo, but can be customized
|
|
||||||
(including using a private repo). Setting the --git flag will also
|
|
||||||
initialize the project directory as a Git repo. If the project is intended
|
|
||||||
to be a Git repo, it should be initialized with Git first, before
|
|
||||||
initializing DVC (Data Version Control). This allows DVC to integrate with
|
|
||||||
Git.
|
|
||||||
"""
|
|
||||||
if dest == Path.cwd():
|
|
||||||
dest = dest / name
|
|
||||||
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("init")
|
|
||||||
def project_init_cli(
|
|
||||||
# fmt: off
|
|
||||||
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Initialize a project directory with DVC and optionally Git. This should
|
|
||||||
typically be taken care of automatically when you run the "project clone"
|
|
||||||
command, but you can also run it separately. If the project is intended to
|
|
||||||
be a Git repo, it should be initialized with Git first, before initializing
|
|
||||||
DVC. This allows DVC to integrate with Git.
|
|
||||||
"""
|
|
||||||
project_init(path, git=git, force=force, silent=True)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("assets")
|
|
||||||
def project_assets_cli(
|
|
||||||
# fmt: off
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Use DVC (Data Version Control) to fetch project assets. Assets are
|
|
||||||
defined in the "assets" section of the project config. If possible, DVC
|
|
||||||
will try to track the files so you can pull changes from upstream. It will
|
|
||||||
also try and store the checksum so the assets are versioned. If the file
|
|
||||||
can't be tracked or checked, it will be downloaded without DVC. If a checksum
|
|
||||||
is provided in the project config, the file is only downloaded if no local
|
|
||||||
file with the same checksum exists.
|
|
||||||
"""
|
|
||||||
project_assets(project_dir)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run-all",
|
|
||||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
||||||
)
|
|
||||||
def project_run_all_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context,
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run all commands defined in the project. This command will use DVC and
|
|
||||||
the defined outputs and dependencies in the project config to determine
|
|
||||||
which steps need to be re-run and where to start. This means you're only
|
|
||||||
re-generating data if the inputs have changed.
|
|
||||||
|
|
||||||
This command calls into "dvc repro" and all additional arguments are passed
|
|
||||||
to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
|
|
||||||
"""
|
|
||||||
if show_help:
|
|
||||||
print_run_help(project_dir)
|
|
||||||
else:
|
|
||||||
project_run_all(project_dir, *ctx.args)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
||||||
)
|
|
||||||
def project_run_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context,
|
|
||||||
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run a named script defined in the project config. If the command is
|
|
||||||
part of the default pipeline defined in the "run" section, DVC is used to
|
|
||||||
determine whether the step should re-run if its inputs have changed, or
|
|
||||||
whether everything is up to date. If the script is not part of the default
|
|
||||||
pipeline, it will be called separately without DVC.
|
|
||||||
|
|
||||||
If DVC is used, the command calls into "dvc repro" and all additional
|
|
||||||
arguments are passed to the "dvc repro" command:
|
|
||||||
https://dvc.org/doc/command-reference/repro
|
|
||||||
"""
|
|
||||||
if show_help or not subcommand:
|
|
||||||
print_run_help(project_dir, subcommand)
|
|
||||||
else:
|
|
||||||
project_run(project_dir, subcommand, *ctx.args)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("exec", hidden=True)
|
|
||||||
def project_exec_cli(
|
|
||||||
# fmt: off
|
|
||||||
subcommand: str = Arg(..., help="Name of command defined in project config"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Execute a command defined in the project config. This CLI command is
|
|
||||||
only called internally in auto-generated DVC pipelines, as a shortcut for
|
|
||||||
multi-step commands in the project config. You typically shouldn't have to
|
|
||||||
call it yourself. To run a command, call "run" or "run-all".
|
|
||||||
"""
|
|
||||||
project_exec(project_dir, subcommand)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("update-dvc")
|
|
||||||
def project_update_dvc_cli(
|
|
||||||
# fmt: off
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Update the auto-generated DVC config file. Uses the steps defined in the
|
|
||||||
"run" section of the project config. This typically happens automatically
|
|
||||||
when running a command, but can also be triggered manually if needed.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
|
||||||
if updated:
|
|
||||||
msg.good(f"Updated DVC config from {CONFIG_FILE}")
|
|
||||||
else:
|
|
||||||
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
|
||||||
|
|
||||||
|
|
||||||
app.add_typer(project_cli, name="project")
|
|
||||||
|
|
||||||
|
|
||||||
#################
|
|
||||||
# CLI FUNCTIONS #
|
|
||||||
#################
|
|
||||||
|
|
||||||
|
|
||||||
def project_clone(
|
|
||||||
name: str,
|
|
||||||
dest: Path,
|
|
||||||
*,
|
|
||||||
repo: str = about.__projects__,
|
|
||||||
git: bool = False,
|
|
||||||
no_init: bool = False,
|
|
||||||
) -> None:
|
|
||||||
"""Clone a project template from a repository.
|
|
||||||
|
|
||||||
name (str): Name of subdirectory to clone.
|
|
||||||
dest (Path): Destination path of cloned project.
|
|
||||||
repo (str): URL of Git repo containing project templates.
|
|
||||||
git (bool): Initialize project as Git repo. Should be set to True if project
|
|
||||||
is intended as a repo, since it will allow DVC to integrate with Git.
|
|
||||||
no_init (bool): Don't initialize DVC and Git automatically. If True, the
|
|
||||||
"init" command or "git init" and "dvc init" need to be run manually.
|
|
||||||
"""
|
|
||||||
dest = ensure_path(dest)
|
|
||||||
check_clone(name, dest, repo)
|
|
||||||
project_dir = dest.resolve()
|
|
||||||
# We're using Git and sparse checkout to only clone the files we need
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
|
||||||
try:
|
|
||||||
run_command(cmd)
|
|
||||||
except SystemExit:
|
|
||||||
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
|
|
||||||
msg.fail(err)
|
|
||||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
|
||||||
f.write(name)
|
|
||||||
try:
|
|
||||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
|
||||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
|
||||||
except SystemExit:
|
|
||||||
err = f"Could not clone '{name}' in the repo '{repo}'."
|
|
||||||
msg.fail(err)
|
|
||||||
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
|
||||||
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
|
|
||||||
for sub_dir in DIRS:
|
|
||||||
dir_path = project_dir / sub_dir
|
|
||||||
if not dir_path.exists():
|
|
||||||
dir_path.mkdir(parents=True)
|
|
||||||
if not no_init:
|
|
||||||
project_init(project_dir, git=git, force=True, silent=True)
|
|
||||||
msg.good(f"Your project is now ready!", dest)
|
|
||||||
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_init(
|
|
||||||
project_dir: Path,
|
|
||||||
*,
|
|
||||||
git: bool = False,
|
|
||||||
force: bool = False,
|
|
||||||
silent: bool = False,
|
|
||||||
analytics: bool = False,
|
|
||||||
):
|
|
||||||
"""Initialize a project as a DVC and (optionally) as a Git repo.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
git (bool): Also call "git init" to initialize directory as a Git repo.
|
|
||||||
silent (bool): Don't print any output (via DVC).
|
|
||||||
analytics (bool): Opt-in to DVC analytics (defaults to False).
|
|
||||||
"""
|
|
||||||
with working_dir(project_dir) as cwd:
|
|
||||||
if git:
|
|
||||||
run_command(["git", "init"])
|
|
||||||
init_cmd = ["dvc", "init"]
|
|
||||||
if silent:
|
|
||||||
init_cmd.append("--quiet")
|
|
||||||
if not git:
|
|
||||||
init_cmd.append("--no-scm")
|
|
||||||
if force:
|
|
||||||
init_cmd.append("--force")
|
|
||||||
run_command(init_cmd)
|
|
||||||
# We don't want to have analytics on by default – our users should
|
|
||||||
# opt-in explicitly. If they want it, they can always enable it.
|
|
||||||
if not analytics:
|
|
||||||
run_command(["dvc", "config", "core.analytics", "false"])
|
|
||||||
# Remove unused and confusing plot templates from .dvc directory
|
|
||||||
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
|
|
||||||
# once you commit your changes via Git and it creates a bunch of files
|
|
||||||
# that have no purpose
|
|
||||||
plots_dir = cwd / DVC_DIR / "plots"
|
|
||||||
if plots_dir.exists():
|
|
||||||
shutil.rmtree(str(plots_dir))
|
|
||||||
config = load_project_config(cwd)
|
|
||||||
setup_check_dvc(cwd, config)
|
|
||||||
|
|
||||||
|
|
||||||
def project_assets(project_dir: Path) -> None:
|
|
||||||
"""Fetch assets for a project using DVC if possible.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
"""
|
|
||||||
project_path = ensure_path(project_dir)
|
|
||||||
config = load_project_config(project_path)
|
|
||||||
setup_check_dvc(project_path, config)
|
|
||||||
assets = config.get("assets", {})
|
|
||||||
if not assets:
|
|
||||||
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
|
||||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
fetched_assets = []
|
|
||||||
for asset in assets:
|
|
||||||
url = asset["url"].format(**variables)
|
|
||||||
dest = asset["dest"].format(**variables)
|
|
||||||
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
|
|
||||||
if fetched_path:
|
|
||||||
fetched_assets.append(str(fetched_path))
|
|
||||||
if fetched_assets:
|
|
||||||
with working_dir(project_path):
|
|
||||||
run_command(["dvc", "add", *fetched_assets, "--external"])
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_asset(
|
|
||||||
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
|
||||||
) -> Optional[Path]:
|
|
||||||
"""Fetch an asset from a given URL or path. Will try to import the file
|
|
||||||
using DVC's import-url if possible (fully tracked and versioned) and falls
|
|
||||||
back to get-url (versioned) and a non-DVC download if necessary. If a
|
|
||||||
checksum is provided and a local file exists, it's only re-downloaded if the
|
|
||||||
checksum doesn't match.
|
|
||||||
|
|
||||||
project_path (Path): Path to project directory.
|
|
||||||
url (str): URL or path to asset.
|
|
||||||
checksum (Optional[str]): Optional expected checksum of local file.
|
|
||||||
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
|
||||||
the asset failed.
|
|
||||||
"""
|
|
||||||
url = convert_asset_url(url)
|
|
||||||
dest_path = (project_path / dest).resolve()
|
|
||||||
if dest_path.exists() and checksum:
|
|
||||||
# If there's already a file, check for checksum
|
|
||||||
# TODO: add support for caches (dvc import-url with local path)
|
|
||||||
if checksum == get_checksum(dest_path):
|
|
||||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
|
||||||
return dest_path
|
|
||||||
with working_dir(project_path):
|
|
||||||
try:
|
|
||||||
# If these fail, we don't want to output an error or info message.
|
|
||||||
# Try with tracking the source first, then just downloading with
|
|
||||||
# DVC, then a regular non-DVC download.
|
|
||||||
try:
|
|
||||||
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
|
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
try:
|
|
||||||
download_file(url, dest_path)
|
|
||||||
except requests.exceptions.HTTPError as e:
|
|
||||||
msg.fail(f"Download failed: {dest}", e)
|
|
||||||
return None
|
|
||||||
if checksum and checksum != get_checksum(dest_path):
|
|
||||||
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
|
|
||||||
msg.good(f"Fetched asset {dest}")
|
|
||||||
return dest_path
|
|
||||||
|
|
||||||
|
|
||||||
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
|
||||||
"""Run all commands defined in the project using DVC.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
*dvc_args: Other arguments passed to "dvc repro".
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
dvc_cmd = ["dvc", "repro", *dvc_args]
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_command(dvc_cmd)
|
|
||||||
|
|
||||||
|
|
||||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
|
||||||
"""Simulate a CLI help prompt using the info available in the project config.
|
|
||||||
|
|
||||||
project_dir (Path): The project directory.
|
|
||||||
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
|
||||||
provided, the subcommand help is shown. Otherwise, the top-level help
|
|
||||||
and a list of available commands is printed.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
if subcommand:
|
|
||||||
validate_subcommand(commands.keys(), subcommand)
|
|
||||||
print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
|
|
||||||
help_text = commands[subcommand].get("help")
|
|
||||||
if help_text:
|
|
||||||
msg.text(f"\n{help_text}\n")
|
|
||||||
else:
|
|
||||||
print(f"\nAvailable commands in {CONFIG_FILE}")
|
|
||||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
|
|
||||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
|
||||||
msg.text("Run all commands defined in the 'run' block of the project config:")
|
|
||||||
print(f"{COMMAND} project run-all {project_dir}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
|
||||||
"""Run a named script defined in the project config. If the script is part
|
|
||||||
of the default pipeline (defined in the "run" section), DVC is used to
|
|
||||||
execute the command, so it can determine whether to rerun it. It then
|
|
||||||
calls into "exec" to execute it.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
subcommand (str): Name of command to run.
|
|
||||||
*dvc_args: Other arguments passed to "dvc repro".
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
setup_check_dvc(project_dir, config)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
validate_subcommand(commands.keys(), subcommand)
|
|
||||||
if subcommand in config.get("run", []):
|
|
||||||
# This is one of the pipeline commands tracked in DVC
|
|
||||||
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_command(dvc_cmd)
|
|
||||||
else:
|
|
||||||
cmd = commands[subcommand]
|
|
||||||
# Deps in non-DVC commands aren't tracked, but if they're defined,
|
|
||||||
# make sure they exist before running the command
|
|
||||||
for dep in cmd.get("deps", []):
|
|
||||||
if not (project_dir / dep).exists():
|
|
||||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_commands(cmd["script"], variables)
|
|
||||||
|
|
||||||
|
|
||||||
def project_exec(project_dir: Path, subcommand: str):
|
|
||||||
"""Execute a command defined in the project config.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
subcommand (str): Name of command to run.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
with working_dir(project_dir):
|
|
||||||
run_commands(commands[subcommand]["script"], variables)
|
|
||||||
|
|
||||||
|
|
||||||
###########
|
|
||||||
# HELPERS #
|
|
||||||
###########
|
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(path: Path) -> Dict[str, Any]:
|
|
||||||
"""Load the project config file from a directory and validate it.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
RETURNS (Dict[str, Any]): The loaded project config.
|
|
||||||
"""
|
|
||||||
config_path = path / CONFIG_FILE
|
|
||||||
if not config_path.exists():
|
|
||||||
msg.fail("Can't find project config", config_path, exits=1)
|
|
||||||
invalid_err = f"Invalid project config in {CONFIG_FILE}"
|
|
||||||
try:
|
|
||||||
config = srsly.read_yaml(config_path)
|
|
||||||
except ValueError as e:
|
|
||||||
msg.fail(invalid_err, e, exits=1)
|
|
||||||
errors = validate(ProjectConfigSchema, config)
|
|
||||||
if errors:
|
|
||||||
msg.fail(invalid_err, "\n".join(errors), exits=1)
|
|
||||||
return config
|
|
||||||
|
|
||||||
|
|
||||||
def update_dvc_config(
|
|
||||||
path: Path,
|
|
||||||
config: Dict[str, Any],
|
|
||||||
verbose: bool = False,
|
|
||||||
silent: bool = False,
|
|
||||||
force: bool = False,
|
|
||||||
) -> bool:
|
|
||||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
|
||||||
project directory. The file is auto-generated based on the config. The
|
|
||||||
first line of the auto-generated file specifies the hash of the config
|
|
||||||
dict, so if any of the config values change, the DVC config is regenerated.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
config (Dict[str, Any]): The loaded project config.
|
|
||||||
verbose (bool): Whether to print additional info (via DVC).
|
|
||||||
silent (bool): Don't output anything (via DVC).
|
|
||||||
force (bool): Force update, even if hashes match.
|
|
||||||
RETURNS (bool): Whether the DVC config file was updated.
|
|
||||||
"""
|
|
||||||
config_hash = get_hash(config)
|
|
||||||
path = path.resolve()
|
|
||||||
dvc_config_path = path / DVC_CONFIG
|
|
||||||
if dvc_config_path.exists():
|
|
||||||
# Check if the file was generated using the current config, if not, redo
|
|
||||||
with dvc_config_path.open("r", encoding="utf8") as f:
|
|
||||||
ref_hash = f.readline().strip().replace("# ", "")
|
|
||||||
if ref_hash == config_hash and not force:
|
|
||||||
return False # Nothing has changed in project config, don't need to update
|
|
||||||
dvc_config_path.unlink()
|
|
||||||
variables = config.get("variables", {})
|
|
||||||
commands = []
|
|
||||||
# We only want to include commands that are part of the main list of "run"
|
|
||||||
# commands in project.yml and should be run in sequence
|
|
||||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
|
||||||
for name in config.get("run", []):
|
|
||||||
validate_subcommand(config_commands.keys(), name)
|
|
||||||
command = config_commands[name]
|
|
||||||
deps = command.get("deps", [])
|
|
||||||
outputs = command.get("outputs", [])
|
|
||||||
outputs_no_cache = command.get("outputs_no_cache", [])
|
|
||||||
if not deps and not outputs and not outputs_no_cache:
|
|
||||||
continue
|
|
||||||
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
|
||||||
# and we don't want arbitrary paths in there
|
|
||||||
project_cmd = ["python", "-m", NAME, "project", "exec", name]
|
|
||||||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
|
||||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
|
||||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
|
||||||
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
|
|
||||||
if verbose:
|
|
||||||
dvc_cmd.append("--verbose")
|
|
||||||
if silent:
|
|
||||||
dvc_cmd.append("--quiet")
|
|
||||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
|
||||||
commands.append(" ".join(full_cmd))
|
|
||||||
with working_dir(path):
|
|
||||||
run_commands(commands, variables, silent=True)
|
|
||||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
|
||||||
content = f.read()
|
|
||||||
f.seek(0, 0)
|
|
||||||
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_dvc() -> None:
|
|
||||||
"""Ensure that the "dvc" command is available and show an error if not."""
|
|
||||||
try:
|
|
||||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
|
||||||
except Exception:
|
|
||||||
msg.fail(
|
|
||||||
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
|
|
||||||
"You can install the Python package from pip (pip install dvc) or "
|
|
||||||
"conda (conda install -c conda-forge dvc). For more details, see the "
|
|
||||||
"documentation: https://dvc.org/doc/install",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
|
|
||||||
"""Check that the project is set up correctly with DVC and update its
|
|
||||||
config if needed. Will raise an error if the project is not an initialized
|
|
||||||
DVC project.
|
|
||||||
|
|
||||||
project_dir (Path): The path to the project directory.
|
|
||||||
config (Dict[str, Any]): The loaded project config.
|
|
||||||
"""
|
|
||||||
if not project_dir.exists():
|
|
||||||
msg.fail(f"Can't find project directory: {project_dir}")
|
|
||||||
if not (project_dir / ".dvc").exists():
|
|
||||||
msg.fail(
|
|
||||||
"Project not initialized as a DVC project.",
|
|
||||||
f"Make sure that the project template was cloned correctly. To "
|
|
||||||
f"initialize the project directory manually, you can run: "
|
|
||||||
f"{COMMAND} project init {project_dir}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
with msg.loading("Updating DVC config..."):
|
|
||||||
updated = update_dvc_config(project_dir, config, silent=True)
|
|
||||||
if updated:
|
|
||||||
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|
|
||||||
|
|
||||||
|
|
||||||
def run_commands(
|
|
||||||
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
|
|
||||||
) -> None:
|
|
||||||
"""Run a sequence of commands in a subprocess, in order.
|
|
||||||
|
|
||||||
commands (List[str]): The string commands.
|
|
||||||
variables (Dict[str, str]): Dictionary of variable names, mapped to their
|
|
||||||
values. Will be used to substitute format string variables in the
|
|
||||||
commands.
|
|
||||||
silent (bool): Don't print the commands.
|
|
||||||
"""
|
|
||||||
for command in commands:
|
|
||||||
# Substitute variables, e.g. "./{NAME}.json"
|
|
||||||
command = command.format(**variables)
|
|
||||||
command = split_command(command)
|
|
||||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
|
||||||
# use commands in their config that reference "python" and we want to
|
|
||||||
# make sure that it's always executing the same Python that spaCy is
|
|
||||||
# executed with and the pip in the same env, not some other Python/pip.
|
|
||||||
# Also ensures cross-compatibility if user 1 writes "python3" (because
|
|
||||||
# that's how it's set up on their system), and user 2 without the
|
|
||||||
# shortcut tries to re-run the command.
|
|
||||||
if len(command) and command[0] in ("python", "python3"):
|
|
||||||
command[0] = sys.executable
|
|
||||||
elif len(command) and command[0] in ("pip", "pip3"):
|
|
||||||
command = [sys.executable, "-m", "pip", *command[1:]]
|
|
||||||
if not silent:
|
|
||||||
print(f"Running command: {' '.join(command)}")
|
|
||||||
run_command(command)
|
|
||||||
|
|
||||||
|
|
||||||
def convert_asset_url(url: str) -> str:
|
|
||||||
"""Check and convert the asset URL if needed.
|
|
||||||
|
|
||||||
url (str): The asset URL.
|
|
||||||
RETURNS (str): The converted URL.
|
|
||||||
"""
|
|
||||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
|
||||||
if re.match("(http(s?)):\/\/github.com", url):
|
|
||||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
|
||||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
|
||||||
msg.warn(
|
|
||||||
"Downloading from a regular GitHub URL. This will only download "
|
|
||||||
"the source of the page, not the actual file. Converting the URL "
|
|
||||||
"to a raw URL.",
|
|
||||||
converted,
|
|
||||||
)
|
|
||||||
return converted
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
def check_clone(name: str, dest: Path, repo: str) -> None:
|
|
||||||
"""Check and validate that the destination path can be used to clone. Will
|
|
||||||
check that Git is available and that the destination path is suitable.
|
|
||||||
|
|
||||||
name (str): Name of the directory to clone from the repo.
|
|
||||||
dest (Path): Local destination of cloned directory.
|
|
||||||
repo (str): URL of the repo to clone from.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
|
|
||||||
except Exception:
|
|
||||||
msg.fail(
|
|
||||||
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
|
||||||
f"To clone a project without Git, copy the files from the '{name}' "
|
|
||||||
f"directory in the {repo} to {dest} manually and then run:",
|
|
||||||
f"{COMMAND} project init {dest}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if not dest:
|
|
||||||
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
|
||||||
if dest.exists():
|
|
||||||
# Directory already exists (not allowed, clone needs to create it)
|
|
||||||
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
|
|
||||||
if not dest.parent.exists():
|
|
||||||
# We're not creating parents, parent dir should exist
|
|
||||||
msg.fail(
|
|
||||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
|
|
||||||
"""Check that a subcommand is valid and defined. Raises an error otherwise.
|
|
||||||
|
|
||||||
commands (Sequence[str]): The available commands.
|
|
||||||
subcommand (str): The subcommand.
|
|
||||||
"""
|
|
||||||
if subcommand not in commands:
|
|
||||||
msg.fail(
|
|
||||||
f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
|
|
||||||
f"Available commands: {', '.join(commands)}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
|
||||||
"""Download a file using requests.
|
|
||||||
|
|
||||||
url (str): The URL of the file.
|
|
||||||
dest (Path): The destination path.
|
|
||||||
chunk_size (int): The size of chunks to read/write.
|
|
||||||
"""
|
|
||||||
response = requests.get(url, stream=True)
|
|
||||||
response.raise_for_status()
|
|
||||||
total = int(response.headers.get("content-length", 0))
|
|
||||||
progress_settings = {
|
|
||||||
"total": total,
|
|
||||||
"unit": "iB",
|
|
||||||
"unit_scale": True,
|
|
||||||
"unit_divisor": chunk_size,
|
|
||||||
"leave": False,
|
|
||||||
}
|
|
||||||
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
|
|
||||||
for data in response.iter_content(chunk_size=chunk_size):
|
|
||||||
size = f.write(data)
|
|
||||||
bar.update(size)
|
|
0
spacy/cli/project/__init__.py
Normal file
0
spacy/cli/project/__init__.py
Normal file
158
spacy/cli/project/assets.py
Normal file
158
spacy/cli/project/assets.py
Normal file
|
@ -0,0 +1,158 @@
|
||||||
|
from typing import Optional
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import requests
|
||||||
|
import tqdm
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from ...util import ensure_path, working_dir
|
||||||
|
from .._app import project_cli, Arg
|
||||||
|
from .util import PROJECT_FILE, load_project_config, get_checksum
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: find a solution for caches
|
||||||
|
# CACHES = [
|
||||||
|
# Path.home() / ".torch",
|
||||||
|
# Path.home() / ".caches" / "torch",
|
||||||
|
# os.environ.get("TORCH_HOME"),
|
||||||
|
# Path.home() / ".keras",
|
||||||
|
# ]
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("assets")
|
||||||
|
def project_assets_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||||
|
defined in the "assets" section of the project.yml. If a checksum is
|
||||||
|
provided in the project.yml, the file is only downloaded if no local file
|
||||||
|
with the same checksum exists.
|
||||||
|
"""
|
||||||
|
project_assets(project_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def project_assets(project_dir: Path) -> None:
|
||||||
|
"""Fetch assets for a project using DVC if possible.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
"""
|
||||||
|
project_path = ensure_path(project_dir)
|
||||||
|
config = load_project_config(project_path)
|
||||||
|
assets = config.get("assets", {})
|
||||||
|
if not assets:
|
||||||
|
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||||
|
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
for asset in assets:
|
||||||
|
dest = asset["dest"].format(**variables)
|
||||||
|
url = asset.get("url")
|
||||||
|
checksum = asset.get("checksum")
|
||||||
|
if not url:
|
||||||
|
# project.yml defines asset without URL that the user has to place
|
||||||
|
check_private_asset(dest, checksum)
|
||||||
|
continue
|
||||||
|
url = url.format(**variables)
|
||||||
|
fetch_asset(project_path, url, dest, checksum)
|
||||||
|
|
||||||
|
|
||||||
|
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
|
||||||
|
"""Check and validate assets without a URL (private assets that the user
|
||||||
|
has to provide themselves) and give feedback about the checksum.
|
||||||
|
|
||||||
|
dest (Path): Desintation path of the asset.
|
||||||
|
checksum (Optional[str]): Optional checksum of the expected file.
|
||||||
|
"""
|
||||||
|
if not Path(dest).exists():
|
||||||
|
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
|
||||||
|
msg.warn(err)
|
||||||
|
else:
|
||||||
|
if checksum and checksum == get_checksum(dest):
|
||||||
|
msg.good(f"Asset exists with matching checksum: {dest}")
|
||||||
|
else:
|
||||||
|
msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_asset(
|
||||||
|
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
"""Fetch an asset from a given URL or path. If a checksum is provided and a
|
||||||
|
local file exists, it's only re-downloaded if the checksum doesn't match.
|
||||||
|
|
||||||
|
project_path (Path): Path to project directory.
|
||||||
|
url (str): URL or path to asset.
|
||||||
|
checksum (Optional[str]): Optional expected checksum of local file.
|
||||||
|
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
||||||
|
the asset failed.
|
||||||
|
"""
|
||||||
|
# TODO: add support for caches
|
||||||
|
dest_path = (project_path / dest).resolve()
|
||||||
|
if dest_path.exists() and checksum:
|
||||||
|
# If there's already a file, check for checksum
|
||||||
|
if checksum == get_checksum(dest_path):
|
||||||
|
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||||
|
return dest_path
|
||||||
|
# We might as well support the user here and create parent directories in
|
||||||
|
# case the asset dir isn't listed as a dir to create in the project.yml
|
||||||
|
if not dest_path.parent.exists():
|
||||||
|
dest_path.parent.mkdir(parents=True)
|
||||||
|
with working_dir(project_path):
|
||||||
|
url = convert_asset_url(url)
|
||||||
|
try:
|
||||||
|
download_file(url, dest_path)
|
||||||
|
msg.good(f"Downloaded asset {dest}")
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
if Path(url).exists() and Path(url).is_file():
|
||||||
|
# If it's a local file, copy to destination
|
||||||
|
shutil.copy(url, str(dest_path))
|
||||||
|
msg.good(f"Copied local asset {dest}")
|
||||||
|
else:
|
||||||
|
msg.fail(f"Download failed: {dest}", e)
|
||||||
|
return
|
||||||
|
if checksum and checksum != get_checksum(dest_path):
|
||||||
|
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def convert_asset_url(url: str) -> str:
|
||||||
|
"""Check and convert the asset URL if needed.
|
||||||
|
|
||||||
|
url (str): The asset URL.
|
||||||
|
RETURNS (str): The converted URL.
|
||||||
|
"""
|
||||||
|
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||||
|
if re.match(r"(http(s?)):\/\/github.com", url):
|
||||||
|
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||||
|
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||||
|
msg.warn(
|
||||||
|
"Downloading from a regular GitHub URL. This will only download "
|
||||||
|
"the source of the page, not the actual file. Converting the URL "
|
||||||
|
"to a raw URL.",
|
||||||
|
converted,
|
||||||
|
)
|
||||||
|
return converted
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
||||||
|
"""Download a file using requests.
|
||||||
|
|
||||||
|
url (str): The URL of the file.
|
||||||
|
dest (Path): The destination path.
|
||||||
|
chunk_size (int): The size of chunks to read/write.
|
||||||
|
"""
|
||||||
|
response = requests.get(url, stream=True)
|
||||||
|
response.raise_for_status()
|
||||||
|
total = int(response.headers.get("content-length", 0))
|
||||||
|
progress_settings = {
|
||||||
|
"total": total,
|
||||||
|
"unit": "iB",
|
||||||
|
"unit_scale": True,
|
||||||
|
"unit_divisor": chunk_size,
|
||||||
|
"leave": False,
|
||||||
|
}
|
||||||
|
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
|
||||||
|
for data in response.iter_content(chunk_size=chunk_size):
|
||||||
|
size = f.write(data)
|
||||||
|
bar.update(size)
|
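For reference, a minimal sketch of the data shape the asset loop above iterates over, as it would come back from load_project_config(). The destination paths, URL and checksums here are invented for illustration only:

# Hypothetical "assets" section of a project.yml after loading
config = {
    "assets": [
        {
            "dest": "assets/training.json",                   # required: where the file should live
            "url": "https://example.com/data/training.json",  # optional: omit for private assets
            "checksum": "63373dd656daa1fd3043ce166a59474c",   # optional: md5 used to skip re-downloads
        },
        # Private asset: no URL, the user has to place the file themselves
        {"dest": "assets/private.json", "checksum": "5113dc04e03f079525edd8df3f4f39e3"},
    ],
}
for asset in config["assets"]:
    print(asset["dest"], "->", asset.get("url", "(provided by the user)"))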
spacy/cli/project/clone.py (new file)
@@ -0,0 +1,97 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
import shutil
import re

from ... import about
from ...util import ensure_path, run_command, make_tempdir
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE


@project_cli.command("clone")
def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
    (including using a private repo).
    """
    if dest is None:
        dest = Path.cwd() / name
    project_clone(name, dest, repo=repo)


def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
        try:
            run_command(cmd)
        except subprocess.CalledProcessError:
            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
            msg.fail(err)
        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
            f.write(name)
        try:
            run_command(["git", "-C", str(tmp_dir), "fetch"])
            run_command(["git", "-C", str(tmp_dir), "checkout"])
        except subprocess.CalledProcessError:
            err = f"Could not clone '{name}' from repo '{repo_name}'"
            msg.fail(err)
        # We need Path(name) to make sure we also support subdirectories
        shutil.move(str(tmp_dir / Path(name)), str(project_dir))
    msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
    if not (project_dir / PROJECT_FILE).exists():
        msg.warn(f"No {PROJECT_FILE} found in directory")
    else:
        msg.good(f"Your project is now ready!")
        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")


def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            f"Cloning spaCy project templates requires Git and the 'git' command. ",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run:",
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )
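A minimal usage sketch for the helper above. The template name "some_example" is hypothetical and would have to exist as a subdirectory in the template repo; everything else follows the signatures defined in this file:

# Clone the hypothetical "some_example" template into ./some_example
from pathlib import Path
from spacy.cli.project.clone import project_clone

project_clone("some_example", Path.cwd() / "some_example")
# Requires git on the PATH; the repo defaults to about.__projects__ and can be
# overridden with project_clone(..., repo="https://github.com/your-org/your-templates")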
spacy/cli/project/dvc.py (new file)
@@ -0,0 +1,208 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional
import subprocess
from pathlib import Path
from wasabi import msg

from .util import PROJECT_FILE, load_project_config, get_hash
from .._app import project_cli, Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command


DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""


@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml changed.
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)


def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    updated = update_dvc_config(
        project_dir, config, workflow, verbose=verbose, force=force
    )
    help_msg = "To execute the workflow with DVC, run: dvc repro"
    if updated:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
    else:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)


def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    if not workflow:
        workflow = workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project.yml, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    dvc_commands = []
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "run", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        dvc_commands.append(join_command(full_cmd))
    with working_dir(path):
        dvc_flags = {"--verbose": verbose, "--quiet": silent}
        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True


def run_dvc_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, str] = {},
    flags: Dict[str, bool] = {},
) -> None:
    """Run a sequence of DVC commands in a subprocess, in order.

    commands (List[str]): The string commands without the leading "dvc".
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
        easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        dvc_command = ["dvc", *command]
        # Add the flags if they are set to True
        for flag, is_active in flags.items():
            if is_active:
                dvc_command.append(flag)
        run_command(dvc_command)


def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    if workflow is not None and workflow not in workflows:
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}",
            exits=1,
        )
    if not workflow:
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )


def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project.
    """
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
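A minimal sketch of driving this DVC integration from Python, assuming DVC is installed and "dvc init" has already been run in the project directory. The project path is hypothetical:

from pathlib import Path
from spacy.cli.project.dvc import project_update_dvc

# Regenerate dvc.yaml from the first workflow defined in project.yml.
# The file is only rewritten if the config hash in its first line changed.
project_update_dvc(Path("my_project"), workflow=None, verbose=False, force=False)
# Afterwards, running "dvc repro" inside my_project replays the selected workflow.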
spacy/cli/project/run.py (new file)
@@ -0,0 +1,266 @@
from typing import Optional, List, Dict, Sequence, Any
from pathlib import Path
from wasabi import msg
import sys
import srsly

from ...util import working_dir, run_command, split_command, is_cwd, join_command
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .util import get_checksum


@project_cli.command("run")
def project_run_cli(
    # fmt: off
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.
    """
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        project_run(project_dir, subcommand, force=force, dry=dry)


def project_run(
    project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    """
    config = load_project_config(project_dir)
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(project_dir, cmd, force=force, dry=dry)
    else:
        cmd = commands[subcommand]
        variables = config.get("variables", {})
        for dep in cmd.get("deps", []):
            dep = dep.format(**variables)
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, **err_kwargs)
        with working_dir(project_dir) as current_dir:
            rerun = check_rerun(current_dir, cmd, variables)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                msg.divider(subcommand)
                run_commands(cmd["script"], variables, dry=dry)
                if not dry:
                    update_lockfile(current_dir, cmd, variables)


def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    project_loc = "" if is_cwd(project_dir) else project_dir
    if subcommand:
        validate_subcommand(commands.keys(), workflows.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
        if subcommand in commands:
            help_text = commands[subcommand].get("help")
            if help_text:
                print(f"\n{help_text}\n")
        elif subcommand in workflows:
            steps = workflows[subcommand]
            print(f"\nWorkflow consisting of {len(steps)} commands:")
            steps_data = [
                (f"{i + 1}. {step}", commands[step].get("help", ""))
                for i, step in enumerate(steps)
            ]
            msg.table(steps_data)
            help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
            print(f"For command details, run: {help_cmd}")
    else:
        print("")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])


def run_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, Any] = {},
    silent: bool = False,
    dry: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = split_command(command)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if len(command) and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif len(command) and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if not dry:
            run_command(command)


def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    subcommand (str): The subcommand.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand not in commands and subcommand not in workflows:
        help_msg = []
        if commands:
            help_msg.append(f"Available commands: {', '.join(commands)}")
        if workflows:
            help_msg.append(f"Available workflows: {', '.join(workflows)}")
        msg.fail(
            f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
            ". ".join(help_msg),
            exits=1,
        )


def check_rerun(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (bool): Whether to re-run the command.
    """
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():  # We don't have a lockfile, run command
        return True
    data = srsly.read_yaml(lock_path)
    if command["name"] not in data:  # We don't have info about this command
        return True
    entry = data[command["name"]]
    # Always run commands with no outputs (otherwise they'd always be skipped)
    if not entry.get("outs", []):
        return True
    # If the entry in the lockfile matches the lockfile entry that would be
    # generated from the current command, we don't rerun because it means that
    # all inputs/outputs, hashes and scripts are the same and nothing changed
    return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)


def update_lockfile(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():
        srsly.write_yaml(lock_path, {})
        data = {}
    else:
        data = srsly.read_yaml(lock_path)
    data[command["name"]] = get_lock_entry(project_dir, command, variables)
    srsly.write_yaml(lock_path, data)


def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    deps = get_fileinfo(project_dir, command.get("deps", []), variables)
    outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
    return {
        "cmd": f"{COMMAND} run {command['name']}",
        "script": command["script"],
        "deps": deps,
        "outs": [*outs, *outs_nc],
    }


def get_fileinfo(
    project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    variables (Dict[str, Any]): The variables defined in the project.yml.
    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
    """
    data = []
    for path in paths:
        path = path.format(**variables)
        file_path = project_dir / path
        md5 = get_checksum(file_path) if file_path.exists() else None
        data.append({"path": path, "md5": md5})
    return data
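For reference, a sketch of what a single entry written by update_lockfile() looks like, based on get_lock_entry() above. The command name, script and file paths are invented for illustration, and the exact "cmd" prefix depends on the COMMAND constant:

# Hypothetical project.lock entry for a "preprocess" command
entry = {
    "cmd": "python -m spacy run preprocess",  # prefix comes from the COMMAND constant
    "script": ["python scripts/preprocess.py assets/raw.json corpus/train.spacy"],
    "deps": [{"path": "assets/raw.json", "md5": "63373dd656daa1fd3043ce166a59474c"}],
    "outs": [{"path": "corpus/train.spacy", "md5": None}],  # md5 is None if the file doesn't exist yet
}
# check_rerun() compares get_hash() of this dict against the stored entry to
# decide whether the command needs to run again.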
spacy/cli/project/util.py (new file)
@@ -0,0 +1,93 @@
from typing import Dict, Any, Union
from pathlib import Path
from wasabi import msg
import srsly
import hashlib

from ...schemas import ProjectConfigSchema, validate


PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"


def load_project_config(path: Path) -> Dict[str, Any]:
    """Load the project.yml file from a directory and validate it. Also make
    sure that all directories defined in the config exist.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err, "\n".join(errors), exits=1)
    validate_project_commands(config)
    # Make sure directories defined in config exist
    for subdir in config.get("directories", []):
        dir_path = path / subdir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    return config


def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
    if duplicates:
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )


def get_hash(data) -> str:
    """Get the hash for a JSON-serializable object.

    data: The data to hash.
    RETURNS (str): The hash.
    """
    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
    return hashlib.md5(data_str).hexdigest()


def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file or directory given its file path. If a
    directory path is provided, this uses all files in that directory.

    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum.
    """
    path = Path(path)
    if path.is_file():
        return hashlib.md5(Path(path).read_bytes()).hexdigest()
    if path.is_dir():
        # TODO: this is currently pretty slow
        dir_checksum = hashlib.md5()
        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
            dir_checksum.update(sub_file.read_bytes())
        return dir_checksum.hexdigest()
    raise ValueError(f"Can't get checksum for {path}: not a file or directory")
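A quick sketch of how the two hashing helpers above are used: get_hash() fingerprints any JSON-serializable object (the dvc.yaml header and lockfile comparison rely on it), while get_checksum() hashes file contents on disk. The config dict below is an arbitrary stand-in:

import hashlib
import srsly

config = {"commands": [{"name": "train", "script": ["python train.py"]}]}
# Equivalent to get_hash(config): serialize with sorted keys, then md5
data_str = srsly.json_dumps(config, sort_keys=True).encode("utf8")
print(hashlib.md5(data_str).hexdigest())
# Any change to the dict (a new command, a renamed output) changes this hash,
# which is what triggers the dvc.yaml regeneration and command re-runs above.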
@@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
 @app.command("train")
 def train_cli(
     # fmt: off
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
+    train_path: Path = Arg(..., help="Location of training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
+    dev_path: Path = Arg(..., help="Location of development data", exists=True),
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
     raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
-    verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
     num_workers: int = Opt(None, "-j", help="Parallel Workers"),
     strategy: str = Opt("allreduce", "--strategy", help="Distributed training strategy (requires spacy_ray)"),
@@ -155,6 +155,7 @@ def train_cli(
     if init_tok2vec is not None:
         with init_tok2vec.open("rb") as file_:
             weights_data = file_.read()
+
     train_args = dict(
         config_path=config_path,
         data_paths={"train": train_path, "dev": dev_path},
@@ -170,7 +171,7 @@ def train_cli(
         distributed_setup_and_train(use_gpu, num_workers, strategy, ray_address, train_args)
     else:
         if use_gpu >= 0:
-            msg.info(f"Using GPU: {str(use_gpu)}")
+            msg.info(f"Using GPU: {use_gpu}")
             require_gpu(use_gpu)
         else:
             msg.info("Using CPU")
@@ -191,7 +192,8 @@ def train(
     msg.info(f"Loading config from: {config_path}")
     # Read the config first without creating objects, to get to the original nlp_config
     config = util.load_config(config_path, create_objects=False)
-    fix_random_seed(config["training"]["seed"])
+    if config["training"].get("seed"):
+        fix_random_seed(config["training"]["seed"])
     if config["training"].get("use_pytorch_for_gpu_memory"):
         # It feels kind of weird to not have a default for this.
         use_pytorch_for_gpu_memory()
@@ -216,7 +218,10 @@ def train(
     msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
     train_examples = list(
         corpus.train_dataset(
-            nlp, shuffle=False, gold_preproc=training["gold_preproc"]
+            nlp,
+            shuffle=False,
+            gold_preproc=training["gold_preproc"],
+            max_length=training["max_length"],
         )
     )
     nlp.begin_training(lambda: train_examples)
@@ -315,6 +320,7 @@ def create_train_batches(nlp, corpus, cfg, randomization_index):
     )

     epoch = 0
+    batch_strategy = cfg.get("batch_by", "sequences")
     while True:
         if len(train_examples) == 0:
             raise ValueError(Errors.E988)
@@ -324,11 +330,22 @@ def create_train_batches(nlp, corpus, cfg, randomization_index):
         random.random()
         random.shuffle(train_examples)
         epoch += 1
-        batches = util.minibatch_by_words(
-            train_examples,
-            size=cfg["batch_size"],
-            discard_oversize=cfg["discard_oversize"],
-        )
+        if batch_strategy == "padded":
+            batches = util.minibatch_by_padded_size(
+                train_examples,
+                size=cfg["batch_size"],
+                buffer=256,
+                discard_oversize=cfg["discard_oversize"],
+            )
+        elif batch_strategy == "words":
+            batches = util.minibatch_by_words(
+                train_examples,
+                size=cfg["batch_size"],
+                discard_oversize=cfg["discard_oversize"],
+            )
+        else:
+            batches = util.minibatch(train_examples, size=cfg["batch_size"])
+
         # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
         try:
             first = next(batches)
@@ -440,7 +457,9 @@ def train_while_improving(

     if raw_text:
         random.shuffle(raw_text)
-        raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text]
+        raw_examples = [
+            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
+        ]
         raw_batches = util.minibatch(raw_examples, size=8)

     for step, (epoch, batch) in enumerate(train_data):
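The create_train_batches() change above dispatches on the new "batch_by" setting. A small Python sketch of the three strategies it selects between; the cfg dict is just a stand-in for the loaded training section, and the string labels mirror the util functions called in the diff:

cfg = {"batch_by": "words", "batch_size": 1000, "discard_oversize": False}
if cfg.get("batch_by", "sequences") == "padded":
    strategy = "minibatch_by_padded_size"  # group by padded length, with a reordering buffer
elif cfg.get("batch_by") == "words":
    strategy = "minibatch_by_words"        # group so each batch holds roughly batch_size words
else:
    strategy = "minibatch"                 # plain fixed-size batches of sequences
print(strategy)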
@@ -69,6 +69,9 @@ class Warnings(object):
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
+    W028 = ("Doc.from_array was called with a vector of type '{type}', "
+            "but is expecting one of type 'uint64' instead. This may result "
+            "in problems with the vocab further on in the pipeline.")
     W030 = ("Some entities could not be aligned in the text \"{text}\" with "
             "entities \"{entities}\". Use "
             "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
@@ -477,15 +480,14 @@ class Errors(object):
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
             "array and {doc_length} for the Doc itself.")
     E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
     E973 = ("Unexpected type for NER data")
     E974 = ("Unknown {obj} attribute: {key}")
-    E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
-            "but got {type}")
-    E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
+    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
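A hedged illustration of the situation the new W028 warning covers: calling Doc.from_array with an array that isn't uint64. The snippet only shows how the wrong dtype arises; whether the call merely warns or also degrades the result depends on the attribute values involved:

import numpy
import spacy
from spacy.attrs import POS

nlp = spacy.blank("en")
doc = nlp("hello world")
arr = doc.to_array([POS]).astype("float32")  # wrong dtype on purpose
doc2 = nlp("hello world")
doc2.from_array([POS], arr)  # expected to emit W028 about the 'float32' vector type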
@@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):

 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
-        """ Doc can either be text, or an actual Doc """
         if predicted is None:
             raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
@@ -37,6 +36,9 @@ cdef class Example:
         self.y = reference
         self._alignment = alignment

+    def __len__(self):
+        return len(self.predicted)
+
     property predicted:
         def __get__(self):
             return self.x
@@ -59,17 +61,15 @@ cdef class Example:

     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
+        if predicted is None:
+            raise ValueError(Errors.E976.format(n="first", type="Doc"))
         if example_dict is None:
-            raise ValueError(Errors.E976)
+            raise ValueError(Errors.E976.format(n="second", type="dict"))
-        if not isinstance(predicted, Doc):
-            raise TypeError(Errors.E975.format(type=type(predicted)))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
             tok_dict["ORTH"] = [tok.text for tok in predicted]
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
-        if not _has_field(tok_dict, "SPACY"):
-            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
         return Example(
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)
@@ -257,7 +257,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([vocab.morphology.add(v) for v in value])
         else:
             attrs.append(key)
-            values.append([vocab.strings.add(v) for v in value])
+            try:
+                values.append([vocab.strings.add(v) for v in value])
+            except TypeError:
+                types = set([type(v) for v in value])
+                raise TypeError(Errors.E969.format(field=key, types=types))

     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
@@ -325,8 +329,8 @@ def _fix_legacy_dict_data(example_dict):
     for key, value in old_token_dict.items():
         if key in ("text", "ids", "brackets"):
             pass
-        elif key in remapping:
-            token_dict[remapping[key]] = value
+        elif key.lower() in remapping:
+            token_dict[remapping[key.lower()]] = value
         else:
             raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
     text = example_dict.get("text", example_dict.get("raw"))
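A small sketch of the stricter Example.from_dict checks above, assuming Example is importable from spacy.gold on this branch; the text and entity offsets are arbitrary:

import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
doc = nlp("I like London")
example = Example.from_dict(doc, {"entities": [(7, 13, "LOC")]})
# Example.from_dict(None, {...})  -> ValueError E976 ("Doc as first argument")
# Example.from_dict(doc, None)    -> ValueError E976 ("dict as second argument")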
@@ -513,20 +513,23 @@ class Language(object):
     ):
         """Update the models in the pipeline.

-        examples (iterable): A batch of `Example` objects.
+        examples (Iterable[Example]): A batch of examples
         dummy: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (callable): An optimizer.
+        sgd (Optimizer): An optimizer.
-        losses (dict): Dictionary to update with the loss, keyed by component.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
-        component_cfg (dict): Config parameters for specific pipeline
+        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
             components, keyed by component name.
+        RETURNS (Dict[str, float]): The updated losses dictionary

         DOCS: https://spacy.io/api/language#update
         """
         if dummy is not None:
             raise ValueError(Errors.E989)
+        if losses is None:
+            losses = {}
         if len(examples) == 0:
-            return
+            return losses
         if not isinstance(examples, Iterable):
             raise TypeError(Errors.E978.format(name="language", method="update", types=type(examples)))
         wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
@@ -540,22 +543,19 @@ class Language(object):

         if component_cfg is None:
             component_cfg = {}
-        component_deps = count_pipeline_interdependencies(self.pipeline)
-        # Determine whether component should set annotations. In theory I guess
-        # we should do this by inspecting the meta? Or we could just always
-        # say "yes"
         for i, (name, proc) in enumerate(self.pipeline):
             component_cfg.setdefault(name, {})
             component_cfg[name].setdefault("drop", drop)
-            component_cfg[name]["set_annotations"] = bool(component_deps[i])
+            component_cfg[name].setdefault("set_annotations", False)
         for name, proc in self.pipeline:
             if not hasattr(proc, "update"):
                 continue
             proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
-        if sgd is not False:
+        if sgd not in (None, False):
             for name, proc in self.pipeline:
                 if hasattr(proc, "model"):
                     proc.model.finish_update(sgd)
+        return losses

     def rehearse(self, examples, sgd=None, losses=None, config=None):
         """Make a "rehearsal" update to the models in the pipeline, to prevent
@@ -761,18 +761,17 @@ class Language(object):
     ):
         """Process texts as a stream, and yield `Doc` objects in order.

-        texts (iterator): A sequence of texts to process.
+        texts (Iterable[str]): A sequence of texts to process.
         as_tuples (bool): If set to True, inputs should be a sequence of
             (text, context) tuples. Output will then be a sequence of
             (doc, context) tuples. Defaults to False.
         batch_size (int): The number of texts to buffer.
-        disable (list): Names of the pipeline components to disable.
+        disable (List[str]): Names of the pipeline components to disable.
         cleanup (bool): If True, unneeded strings are freed to control memory
             use. Experimental.
-        component_cfg (dict): An optional dictionary with extra keyword
+        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
             arguments for specific components.
-        n_process (int): Number of processors to process texts, only supported
-            in Python3. If -1, set `multiprocessing.cpu_count()`.
+        n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
         YIELDS (Doc): Documents in the order of the original text.

         DOCS: https://spacy.io/api/language#pipe
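A sketch of the updated Language.update() contract above: losses no longer has to be passed in, and the (possibly freshly created) dict is returned. This assumes the create_pipe/add_pipe API as it exists on this branch; the example text and label are made up:

import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("LOC")
optimizer = nlp.begin_training()
doc = nlp.make_doc("I like London")
example = Example.from_dict(doc, {"entities": [(7, 13, "LOC")]})
losses = nlp.update([example], sgd=optimizer)  # returns the losses dict
print(losses)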
@@ -1,13 +1,14 @@
 from thinc.api import Model, normal_init


-def PrecomputableAffine(nO, nI, nF, nP):
+def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
     model = Model(
         "precomputable_affine",
         forward,
         init=init,
         dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
         params={"W": None, "b": None, "pad": None},
+        attrs={"dropout_rate": dropout}
     )
     return model

@@ -48,17 +49,14 @@ def forward(model, X, is_train):
         model.inc_grad("b", dY.sum(axis=0))
         dY = dY.reshape((dY.shape[0], nO * nP))

-        Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
+        Wopfi = W.transpose((1, 2, 0, 3))
         Wopfi = Wopfi.reshape((nO * nP, nF * nI))
         dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)

-        # Reuse the buffer
-        dWopfi = Wopfi
-        dWopfi.fill(0.0)
-        model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
+        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
         dWopfi = dWopfi.reshape((nO, nP, nF, nI))
         # (o, p, f, i) --> (f, o, p, i)
-        dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
+        dWopfi = dWopfi.transpose((2, 0, 1, 3))
         model.inc_grad("W", dWopfi)
         return dXf.reshape((dXf.shape[0], nF, nI))
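A sketch of the new dropout attribute on PrecomputableAffine. The import path is assumed to be spacy.ml._precomputable_affine (the file name is not shown in this diff), and the dimensions are arbitrary; the point is only that the rate is stored on the model's attrs so the forward pass can read it during training:

from spacy.ml._precomputable_affine import PrecomputableAffine  # assumed module path

layer = PrecomputableAffine(nO=64, nI=96, nF=13, nP=3, dropout=0.1)
print(layer.attrs["dropout_rate"])  # 0.1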
@@ -87,16 +87,16 @@ def build_text_classifier(
     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
         lower = HashEmbed(
-            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
+            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
         )
         prefix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
         )
         suffix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
         )
         shape = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
+            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
         )

         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
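Giving each HashEmbed table its own fixed seed, as this hunk does, makes the hashed embeddings reproducible across runs and distinct between tables. A small check of that property with Thinc's public HashEmbed (dimensions and ids are arbitrary):

import numpy
from thinc.api import HashEmbed

ids = numpy.array([[1], [2], [3]], dtype="uint64")
table_a = HashEmbed(nO=8, nV=100, column=0, seed=10)
table_b = HashEmbed(nO=8, nV=100, column=0, seed=10)
table_a.initialize()
table_b.initialize()
# the same seed should give identical vectors for the same ids
assert numpy.array_equal(table_a.predict(ids), table_b.predict(ids))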
|
||||||
|
|
|
@ -154,16 +154,16 @@ def LayerNormalizedMaxout(width, maxout_pieces):
|
||||||
def MultiHashEmbed(
|
def MultiHashEmbed(
|
||||||
columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
|
columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
|
||||||
):
|
):
|
||||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
|
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6)
|
||||||
if use_subwords:
|
if use_subwords:
|
||||||
prefix = HashEmbed(
|
prefix = HashEmbed(
|
||||||
nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
|
nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout, seed=7
|
||||||
)
|
)
|
||||||
suffix = HashEmbed(
|
suffix = HashEmbed(
|
||||||
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
|
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout, seed=8
|
||||||
)
|
)
|
||||||
shape = HashEmbed(
|
shape = HashEmbed(
|
||||||
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
|
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout, seed=9
|
||||||
)
|
)
|
||||||
|
|
||||||
if pretrained_vectors:
|
if pretrained_vectors:
|
||||||
|
@ -192,7 +192,7 @@ def MultiHashEmbed(
|
||||||
|
|
||||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||||
def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
|
def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
|
||||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
|
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5)
|
||||||
chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
|
chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
|
||||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||||
embed_layer = chr_embed | features >> with_array(norm)
|
embed_layer = chr_embed | features >> with_array(norm)
|
||||||
|
@ -263,20 +263,20 @@ def build_Tok2Vec_model(
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||||
norm = HashEmbed(
|
norm = HashEmbed(
|
||||||
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout,
|
nO=width, nV=embed_size, column=cols.index(NORM), dropout=None,
|
||||||
seed=0
|
seed=0
|
||||||
)
|
)
|
||||||
if subword_features:
|
if subword_features:
|
||||||
prefix = HashEmbed(
|
prefix = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None,
|
||||||
seed=1
|
seed=1
|
||||||
)
|
)
|
||||||
suffix = HashEmbed(
|
suffix = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None,
|
||||||
seed=2
|
seed=2
|
||||||
)
|
)
|
||||||
shape = HashEmbed(
|
shape = HashEmbed(
|
||||||
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout,
|
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None,
|
||||||
seed=3
|
seed=3
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
@ -296,7 +296,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -309,7 +309,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -322,7 +322,7 @@ def build_Tok2Vec_model(
|
||||||
>> Maxout(
|
>> Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=width * columns,
|
nI=width * columns,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
),
|
),
|
||||||
|
@ -335,7 +335,7 @@ def build_Tok2Vec_model(
|
||||||
reduce_dimensions = Maxout(
|
reduce_dimensions = Maxout(
|
||||||
nO=width,
|
nO=width,
|
||||||
nI=nM * nC + width,
|
nI=nM * nC + width,
|
||||||
nP=maxout_pieces,
|
nP=3,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear
|
||||||
from ..syntax._parser_model import ParserStepModel
|
from ..syntax._parser_model import ParserStepModel
|
||||||
|
|
||||||
|
|
||||||
def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
|
def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
|
||||||
"""Set up a stepwise transition-based model"""
|
"""Set up a stepwise transition-based model"""
|
||||||
if upper is None:
|
if upper is None:
|
||||||
has_upper = False
|
has_upper = False
|
||||||
|
|
|
@ -272,7 +272,7 @@ cdef class Morphology:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def feats_to_dict(feats):
|
def feats_to_dict(feats):
|
||||||
if not feats:
|
if not feats or feats == Morphology.EMPTY_MORPH:
|
||||||
return {}
|
return {}
|
||||||
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
||||||
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
||||||
|
|
|
@@ -3,7 +3,7 @@ cimport numpy as np

 import numpy
 import srsly
-from thinc.api import to_categorical
+from thinc.api import SequenceCategoricalCrossentropy

 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
@@ -85,13 +85,10 @@ class Morphologizer(Tagger):
         doc.is_morphed = True

     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = {tag: i for i, tag in enumerate(self.labels)}
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        truths = []
         for eg in examples:
+            eg_truths = []
             pos_tags = eg.get_aligned("POS", as_string=True)
             morphs = eg.get_aligned("MORPH", as_string=True)
             for i in range(len(morphs)):
@@ -104,20 +101,11 @@ class Morphologizer(Tagger):
                 morph = self.vocab.strings[self.vocab.morphology.add(feats)]
                 if morph == "":
                     morph = Morphology.EMPTY_MORPH
-                if morph is None:
-                    correct[idx] = guesses[idx]
-                elif morph in tag_index:
-                    correct[idx] = tag_index[morph]
-                else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [eg.predicted for eg in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+                eg_truths.append(morph)
+            truths.append(eg_truths)
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores

     def to_bytes(self, exclude=tuple()):
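The rewritten get_loss above delegates to Thinc's sequence cross-entropy. A standalone sketch of that loss (label names and scores are made up; shapes follow the per-document list-of-arrays convention the tagger models use):

import numpy
from thinc.api import SequenceCategoricalCrossentropy

labels = ["A", "B"]
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
# one document with three tokens, scored over the two labels
scores = [numpy.asarray([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]], dtype="f")]
truths = [["A", "B", "A"]]
d_scores, loss = loss_func(scores, truths)
print(float(loss), d_scores[0].shape)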
|
||||||
|
|
|
@@ -58,12 +58,8 @@ class Pipe(object):
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        predictions = self.predict([doc])
-        if isinstance(predictions, tuple) and len(predictions) == 2:
-            scores, tensors = predictions
-            self.set_annotations([doc], scores, tensors=tensors)
-        else:
-            self.set_annotations([doc], predictions)
+        scores = self.predict([doc])
+        self.set_annotations([doc], scores)
         return doc

     def pipe(self, stream, batch_size=128):
@@ -73,12 +69,8 @@ class Pipe(object):
         and `set_annotations()` methods.
         """
         for docs in util.minibatch(stream, size=batch_size):
-            predictions = self.predict(docs)
-            if isinstance(predictions, tuple) and len(tuple) == 2:
-                scores, tensors = predictions
-                self.set_annotations(docs, scores, tensors=tensors)
-            else:
-                self.set_annotations(docs, predictions)
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
             yield from docs

     def predict(self, docs):
@@ -87,7 +79,7 @@ class Pipe(object):
         """
         raise NotImplementedError

-    def set_annotations(self, docs, scores, tensors=None):
+    def set_annotations(self, docs, scores):
         """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError

@@ -281,9 +273,10 @@ class Tagger(Pipe):
             idx += 1
         doc.is_tagged = True

-    def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
+    def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)

         try:
             if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
@@ -303,11 +296,11 @@ class Tagger(Pipe):
         if sgd not in (None, False):
             self.model.finish_update(sgd)

-        if losses is not None:
-            losses[self.name] += loss
+        losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, self._scores2guesses(tag_scores))
+        return losses

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         """Perform a 'rehearsal' update, where we try to match the output of
@@ -334,7 +327,7 @@ class Tagger(Pipe):
             losses[self.name] += (gradient**2).sum()

     def get_loss(self, examples, scores):
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
         truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
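Several components in this diff converge on the same update() contract: keyword-only arguments, a losses dict created when the caller passes None, and that dict returned at the end. A reduced standalone sketch of the contract (the component name and the zero loss are placeholders):

def update(name, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False):
    # create the dict if the caller did not pass one, then always return it,
    # so `losses = update(...)` works with or without a shared dict
    if losses is None:
        losses = {}
    losses.setdefault(name, 0.0)
    loss = 0.0  # placeholder for the component-specific loss computation
    losses[name] += loss
    return losses

shared = {}
update("tagger", [], losses=shared)
print(update("tagger", []), shared)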
|
||||||
|
@@ -521,29 +514,23 @@ class SentenceRecognizer(Tagger):
                     doc.c[j].sent_start = -1

     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = range(len(self.labels))
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        labels = self.labels
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        truths = []
         for eg in examples:
-            sent_starts = eg.get_aligned("sent_start")
-            for sent_start in sent_starts:
-                if sent_start is None:
-                    correct[idx] = guesses[idx]
-                elif sent_start in tag_index:
-                    correct[idx] = sent_start
+            eg_truth = []
+            for x in eg.get_aligned("sent_start"):
+                if x == None:
+                    eg_truth.append(None)
+                elif x == 1:
+                    eg_truth.append(labels[1])
                 else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [eg.predicted for eg in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+                    # anything other than 1: 0, -1, -1 as uint64
+                    eg_truth.append(labels[0])
+            truths.append(eg_truth)
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores

     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
@ -641,7 +628,7 @@ class MultitaskObjective(Tagger):
|
||||||
def labels(self, value):
|
def labels(self, value):
|
||||||
self.cfg["labels"] = value
|
self.cfg["labels"] = value
|
||||||
|
|
||||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
||||||
|
@ -738,7 +725,7 @@ class ClozeMultitask(Pipe):
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
|
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
|
||||||
|
|
||||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
||||||
|
@ -767,7 +754,7 @@ class ClozeMultitask(Pipe):
|
||||||
loss = self.distance.get_loss(prediction, target)
|
loss = self.distance.get_loss(prediction, target)
|
||||||
return loss, gradient
|
return loss, gradient
|
||||||
|
|
||||||
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
|
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||||
|
@@ -815,8 +802,8 @@ class TextCategorizer(Pipe):

     def pipe(self, stream, batch_size=128):
         for docs in util.minibatch(stream, size=batch_size):
-            scores, tensors = self.predict(docs)
-            self.set_annotations(docs, scores, tensors=tensors)
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
             yield from docs

     def predict(self, docs):
@@ -826,22 +813,25 @@ class TextCategorizer(Pipe):
             # Handle cases where there are no tokens in any docs.
             xp = get_array_module(tensors)
             scores = xp.zeros((len(docs), len(self.labels)))
-            return scores, tensors
+            return scores

         scores = self.model.predict(docs)
         scores = self.model.ops.asarray(scores)
-        return scores, tensors
+        return scores

-    def set_annotations(self, docs, scores, tensors=None):
+    def set_annotations(self, docs, scores):
         for i, doc in enumerate(docs):
             for j, label in enumerate(self.labels):
                 doc.cats[label] = float(scores[i, j])

-    def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
+    def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
         try:
             if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
                 # Handle cases where there are no tokens in any docs.
-                return
+                return losses
         except AttributeError:
             types = set([type(eg) for eg in examples])
             raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types))
@@ -853,12 +843,11 @@ class TextCategorizer(Pipe):
         bp_scores(d_scores)
         if sgd is not None:
             self.model.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += loss
+        losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, scores=scores)
+        return losses

     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         if self._rehearsal_model is None:
@@ -1082,12 +1071,13 @@ class EntityLinker(Pipe):
             sgd = self.create_optimizer()
         return sgd

-    def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
+    def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
         self.require_kb()
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
         if not examples:
-            return 0
+            return losses
         sentence_docs = []
         try:
             docs = [eg.predicted for eg in examples]
@@ -1130,20 +1120,19 @@ class EntityLinker(Pipe):
             return 0.0
         sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
         loss, d_scores = self.get_similarity_loss(
-            scores=sentence_encodings,
+            sentence_encodings=sentence_encodings,
             examples=examples
         )
         bp_context(d_scores)
         if sgd is not None:
             self.model.finish_update(sgd)

-        if losses is not None:
-            losses[self.name] += loss
+        losses[self.name] += loss
         if set_annotations:
             self.set_annotations(docs, predictions)
-        return loss
+        return losses

-    def get_similarity_loss(self, examples, scores):
+    def get_similarity_loss(self, examples, sentence_encodings):
         entity_encodings = []
         for eg in examples:
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
@@ -1155,41 +1144,23 @@ class EntityLinker(Pipe):

         entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")

-        if scores.shape != entity_encodings.shape:
+        if sentence_encodings.shape != entity_encodings.shape:
             raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))

-        gradients = self.distance.get_grad(scores, entity_encodings)
-        loss = self.distance.get_loss(scores, entity_encodings)
+        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
+        loss = self.distance.get_loss(sentence_encodings, entity_encodings)
         loss = loss / len(entity_encodings)
         return loss, gradients

-    def get_loss(self, examples, scores):
-        cats = []
-        for eg in examples:
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    cats.append([1.0])
-
-        cats = self.model.ops.asarray(cats, dtype="float32")
-        if len(scores) != len(cats):
-            raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
-
-        d_scores = (scores - cats)
-        loss = (d_scores ** 2).sum()
-        loss = loss / len(cats)
-        return loss, d_scores
-
     def __call__(self, doc):
-        kb_ids, tensors = self.predict([doc])
-        self.set_annotations([doc], kb_ids, tensors=tensors)
+        kb_ids = self.predict([doc])
+        self.set_annotations([doc], kb_ids)
         return doc

     def pipe(self, stream, batch_size=128):
         for docs in util.minibatch(stream, size=batch_size):
-            kb_ids, tensors = self.predict(docs)
-            self.set_annotations(docs, kb_ids, tensors=tensors)
+            kb_ids = self.predict(docs)
+            self.set_annotations(docs, kb_ids)
             yield from docs

     def predict(self, docs):
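The renamed get_similarity_loss compares sentence encodings against gold entity encodings using the CosineDistance objective set up in __init__. A standalone sketch of that comparison (arrays are toy data; the real self.distance is constructed with ignore_zeros=True, normalize=False, as shown earlier in this diff):

import numpy
from thinc.api import CosineDistance

distance = CosineDistance(ignore_zeros=True, normalize=False)
sentence_encodings = numpy.asarray([[0.1, 0.9], [0.7, 0.3]], dtype="float32")
entity_encodings = numpy.asarray([[0.0, 1.0], [1.0, 0.0]], dtype="float32")
gradients = distance.get_grad(sentence_encodings, entity_encodings)
loss = distance.get_loss(sentence_encodings, entity_encodings) / len(entity_encodings)
print(float(loss), gradients.shape)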
|
||||||
|
@ -1197,10 +1168,9 @@ class EntityLinker(Pipe):
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
entity_count = 0
|
entity_count = 0
|
||||||
final_kb_ids = []
|
final_kb_ids = []
|
||||||
final_tensors = []
|
|
||||||
|
|
||||||
if not docs:
|
if not docs:
|
||||||
return final_kb_ids, final_tensors
|
return final_kb_ids
|
||||||
|
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
@ -1234,21 +1204,18 @@ class EntityLinker(Pipe):
|
||||||
if to_discard and ent.label_ in to_discard:
|
if to_discard and ent.label_ in to_discard:
|
||||||
# ignoring this entity - setting to NIL
|
# ignoring this entity - setting to NIL
|
||||||
final_kb_ids.append(self.NIL)
|
final_kb_ids.append(self.NIL)
|
||||||
final_tensors.append(sentence_encoding)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
candidates = self.kb.get_candidates(ent.text)
|
candidates = self.kb.get_candidates(ent.text)
|
||||||
if not candidates:
|
if not candidates:
|
||||||
# no prediction possible for this entity - setting to NIL
|
# no prediction possible for this entity - setting to NIL
|
||||||
final_kb_ids.append(self.NIL)
|
final_kb_ids.append(self.NIL)
|
||||||
final_tensors.append(sentence_encoding)
|
|
||||||
|
|
||||||
elif len(candidates) == 1:
|
elif len(candidates) == 1:
|
||||||
# shortcut for efficiency reasons: take the 1 candidate
|
# shortcut for efficiency reasons: take the 1 candidate
|
||||||
|
|
||||||
# TODO: thresholding
|
# TODO: thresholding
|
||||||
final_kb_ids.append(candidates[0].entity_)
|
final_kb_ids.append(candidates[0].entity_)
|
||||||
final_tensors.append(sentence_encoding)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
random.shuffle(candidates)
|
random.shuffle(candidates)
|
||||||
|
@ -1277,14 +1244,13 @@ class EntityLinker(Pipe):
|
||||||
best_index = scores.argmax().item()
|
best_index = scores.argmax().item()
|
||||||
best_candidate = candidates[best_index]
|
best_candidate = candidates[best_index]
|
||||||
final_kb_ids.append(best_candidate.entity_)
|
final_kb_ids.append(best_candidate.entity_)
|
||||||
final_tensors.append(sentence_encoding)
|
|
||||||
|
|
||||||
if not (len(final_tensors) == len(final_kb_ids) == entity_count):
|
if not (len(final_kb_ids) == entity_count):
|
||||||
raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))
|
raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))
|
||||||
|
|
||||||
return final_kb_ids, final_tensors
|
return final_kb_ids
|
||||||
|
|
||||||
def set_annotations(self, docs, kb_ids, tensors=None):
|
def set_annotations(self, docs, kb_ids):
|
||||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||||
if count_ents != len(kb_ids):
|
if count_ents != len(kb_ids):
|
||||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||||
|
@ -1400,11 +1366,7 @@ class Sentencizer(Pipe):
|
||||||
def pipe(self, stream, batch_size=128):
|
def pipe(self, stream, batch_size=128):
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
predictions = self.predict(docs)
|
predictions = self.predict(docs)
|
||||||
if isinstance(predictions, tuple) and len(tuple) == 2:
|
self.set_annotations(docs, predictions)
|
||||||
scores, tensors = predictions
|
|
||||||
self.set_annotations(docs, scores, tensors=tensors)
|
|
||||||
else:
|
|
||||||
self.set_annotations(docs, predictions)
|
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
@ -1435,7 +1397,7 @@ class Sentencizer(Pipe):
|
||||||
guesses.append(doc_guesses)
|
guesses.append(doc_guesses)
|
||||||
return guesses
|
return guesses
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids, tensors=None):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
|
|
@ -57,7 +57,7 @@ class SimpleNER(Pipe):
|
||||||
scores = self.model.predict(docs)
|
scores = self.model.predict(docs)
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None):
|
def set_annotations(self, docs: List[Doc], scores: List[Floats2d]):
|
||||||
"""Set entities on a batch of documents from a batch of scores."""
|
"""Set entities on a batch of documents from a batch of scores."""
|
||||||
tag_names = self.get_tag_names()
|
tag_names = self.get_tag_names()
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
|
@ -67,9 +67,12 @@ class SimpleNER(Pipe):
|
||||||
tags = iob_to_biluo(tags)
|
tags = iob_to_biluo(tags)
|
||||||
doc.ents = spans_from_biluo_tags(doc, tags)
|
doc.ents = spans_from_biluo_tags(doc, tags)
|
||||||
|
|
||||||
def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
|
def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
|
||||||
|
if losses is None:
|
||||||
|
losses = {}
|
||||||
|
losses.setdefault("ner", 0.0)
|
||||||
if not any(_has_ner(eg) for eg in examples):
|
if not any(_has_ner(eg) for eg in examples):
|
||||||
return 0
|
return losses
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
scores, bp_scores = self.model.begin_update(docs)
|
scores, bp_scores = self.model.begin_update(docs)
|
||||||
|
@ -79,10 +82,8 @@ class SimpleNER(Pipe):
|
||||||
self.set_annotations(docs, scores)
|
self.set_annotations(docs, scores)
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
self.model.finish_update(sgd)
|
self.model.finish_update(sgd)
|
||||||
if losses is not None:
|
losses["ner"] += loss
|
||||||
losses.setdefault("ner", 0.0)
|
return losses
|
||||||
losses["ner"] += loss
|
|
||||||
return loss
|
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
loss = 0
|
loss = 0
|
||||||
|
|
|
@@ -83,12 +83,14 @@ class Tok2Vec(Pipe):
             assert tokvecs.shape[0] == len(doc)
             doc.tensor = tokvecs

-    def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
+    def update(self, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False):
         """Update the model.
-        examples (iterable): A batch of examples
+        examples (Iterable[Example]): A batch of examples
         drop (float): The droput rate.
-        sgd (callable): An optimizer.
-        RETURNS (dict): Results from the update.
+        sgd (Optimizer): An optimizer.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        set_annotations (bool): whether or not to update the examples with the predictions
+        RETURNS (Dict[str, float]): The updated losses dictionary
         """
         if losses is None:
             losses = {}
@@ -124,6 +126,7 @@ class Tok2Vec(Pipe):
                 self.listeners[-1].receive(batch_id, tokvecs, backprop)
         if set_annotations:
             self.set_annotations(docs, tokvecs)
+        return losses

     def get_loss(self, docs, golds, scores):
         pass
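With the signature documented above, callers pass everything except the example batch by keyword. A minimal training-loop sketch (nlp, optimizer and train_examples stand in for whatever the surrounding script provides; only minibatch is imported here):

from spacy.util import minibatch

losses = {}
for batch in minibatch(train_examples, size=8):
    # drop, sgd, losses and set_annotations are keyword-only in the new signatures
    nlp.update(batch, drop=0.1, sgd=optimizer, losses=losses)
print(losses)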
|
||||||
|
|
|
@@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
 class ProjectConfigAsset(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
-    url: StrictStr = Field(..., title="URL of asset")
+    url: Optional[StrictStr] = Field(None, title="URL of asset")
     checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     # fmt: on

@@ -232,9 +232,10 @@ class ProjectConfigCommand(BaseModel):
     name: StrictStr = Field(..., title="Name of command")
     help: Optional[StrictStr] = Field(None, title="Command description")
     script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
-    deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
-    outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
-    outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
+    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
+    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
+    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
+    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
     # fmt: on

     class Config:
@@ -246,7 +247,7 @@ class ProjectConfigSchema(BaseModel):
     # fmt: off
     variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
     assets: List[ProjectConfigAsset] = Field([], title="Data assets")
-    run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
+    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
     # fmt: on
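With the schema above, a project config now declares named workflows rather than a single run list. A hedged sketch of validating a config dict against it (command names and paths are invented; the import path assumes the schema lives in spacy.schemas, as in the nightly tree):

from spacy.schemas import ProjectConfigSchema

config = {
    "variables": {"lang": "en"},
    "assets": [{"dest": "assets/data.jsonl", "url": "https://example.com/data.jsonl"}],
    "commands": [
        {"name": "preprocess", "script": ["python scripts/preprocess.py"], "outputs": ["corpus/train.spacy"]},
        {"name": "train", "script": ["python -m spacy train config.cfg"], "deps": ["corpus/train.spacy"]},
    ],
    "workflows": {"all": ["preprocess", "train"]},
}
validated = ProjectConfigSchema(**config)
print(validated.workflows["all"])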
|
||||||
|
|
||||||
|
|
|
@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
||||||
|
|
||||||
|
|
||||||
class ParserStepModel(Model):
|
class ParserStepModel(Model):
|
||||||
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
|
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
|
||||||
|
dropout=0.1):
|
||||||
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
||||||
self.attrs["has_upper"] = has_upper
|
self.attrs["has_upper"] = has_upper
|
||||||
|
self.attrs["dropout_rate"] = dropout
|
||||||
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
|
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
|
||||||
if layers[1].get_dim("nP") >= 2:
|
if layers[1].get_dim("nP") >= 2:
|
||||||
activation = "maxout"
|
activation = "maxout"
|
||||||
|
@ -243,6 +245,13 @@ class ParserStepModel(Model):
|
||||||
for class_ in unseen_classes:
|
for class_ in unseen_classes:
|
||||||
self._class_mask[class_] = 0.
|
self._class_mask[class_] = 0.
|
||||||
|
|
||||||
|
def clear_memory(self):
|
||||||
|
del self.tokvecs
|
||||||
|
del self.bp_tokvecs
|
||||||
|
del self.state2vec
|
||||||
|
del self.backprops
|
||||||
|
del self._class_mask
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def nO(self):
|
def nO(self):
|
||||||
if self.attrs["has_upper"]:
|
if self.attrs["has_upper"]:
|
||||||
|
@ -271,6 +280,19 @@ class ParserStepModel(Model):
|
||||||
c_ids += ids.shape[1]
|
c_ids += ids.shape[1]
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
|
||||||
|
if isinstance(self.state2vec.ops, CupyOps) \
|
||||||
|
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
||||||
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
|
self.backprops.append((
|
||||||
|
util.get_async(self.cuda_stream, token_ids),
|
||||||
|
util.get_async(self.cuda_stream, d_vector),
|
||||||
|
get_d_tokvecs
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
|
||||||
|
|
||||||
|
|
||||||
def finish_steps(self, golds):
|
def finish_steps(self, golds):
|
||||||
# Add a padding vector to the d_tokvecs gradient, so that missing
|
# Add a padding vector to the d_tokvecs gradient, so that missing
|
||||||
# values don't affect the real gradient.
|
# values don't affect the real gradient.
|
||||||
|
@@ -289,11 +311,17 @@ class ParserStepModel(Model):
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs

+NUMPY_OPS = NumpyOps()
+
 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+    mask = None
     if model.attrs["has_upper"]:
+        dropout_rate = model.attrs["dropout_rate"]
+        if is_train and dropout_rate > 0:
+            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
+            vector *= mask
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
@@ -305,16 +333,9 @@ def step_forward(model: ParserStepModel, states, is_train):
         # Zero vectors for unseen classes
         d_scores *= model._class_mask
         d_vector = get_d_vector(d_scores)
-        if isinstance(model.state2vec.ops, CupyOps) \
-                and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            model.backprops.append((
-                util.get_async(model.cuda_stream, token_ids),
-                util.get_async(model.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+        if mask is not None:
+            d_vector *= mask
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
         return None
     return scores, backprop_parser_step
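The mask created in the forward pass above is reapplied to d_vector on the way back, so units that were dropped receive no gradient. A toy check of that symmetry with Thinc's ops (shape and rate are arbitrary; get_dropout_mask returns an inverted-dropout mask that scales kept entries by 1/(1-rate)):

import numpy
from thinc.api import NumpyOps

ops = NumpyOps()
vector = numpy.ones((4, 6), dtype="f")
mask = ops.get_dropout_mask(vector.shape, 0.1)
vector *= mask                      # forward: mask the activations
d_vector = numpy.ones((4, 6), dtype="f")
d_vector *= mask                    # backward: mask the gradient with the same mask
assert ((mask == 0) == (d_vector == 0)).all()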
|
||||||
|
|
||||||
|
@ -437,7 +458,7 @@ cdef class precompute_hiddens:
|
||||||
sum_state_features(<float*>state_vector.data,
|
sum_state_features(<float*>state_vector.data,
|
||||||
feat_weights, &ids[0,0],
|
feat_weights, &ids[0,0],
|
||||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||||
state_vector = state_vector + self.bias
|
state_vector += self.bias
|
||||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||||
|
|
||||||
def backward(d_state_vector_ids):
|
def backward(d_state_vector_ids):
|
||||||
|
|
|
@ -65,7 +65,6 @@ cdef class Parser:
|
||||||
self.set_output(self.moves.n_moves)
|
self.set_output(self.moves.n_moves)
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
||||||
self.cfg.setdefault("normalize_gradients_with_batch_size", True)
|
|
||||||
self._multitasks = []
|
self._multitasks = []
|
||||||
for multitask in cfg.get("multitasks", []):
|
for multitask in cfg.get("multitasks", []):
|
||||||
self.add_multitask_objective(multitask)
|
self.add_multitask_objective(multitask)
|
||||||
|
@ -154,7 +153,7 @@ cdef class Parser:
|
||||||
doc (Doc): The document to be processed.
|
doc (Doc): The document to be processed.
|
||||||
"""
|
"""
|
||||||
states = self.predict([doc])
|
states = self.predict([doc])
|
||||||
self.set_annotations([doc], states, tensors=None)
|
self.set_annotations([doc], states)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, docs, int batch_size=256):
|
def pipe(self, docs, int batch_size=256):
|
||||||
|
@ -171,7 +170,7 @@ cdef class Parser:
|
||||||
for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
|
for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
|
||||||
subbatch = list(subbatch)
|
subbatch = list(subbatch)
|
||||||
parse_states = self.predict(subbatch)
|
parse_states = self.predict(subbatch)
|
||||||
self.set_annotations(subbatch, parse_states, tensors=None)
|
self.set_annotations(subbatch, parse_states)
|
||||||
yield from batch_in_order
|
yield from batch_in_order
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
@ -201,6 +200,8 @@ cdef class Parser:
|
||||||
with nogil:
|
with nogil:
|
||||||
self._parseC(&states[0],
|
self._parseC(&states[0],
|
||||||
weights, sizes)
|
weights, sizes)
|
||||||
|
model.clear_memory()
|
||||||
|
del model
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
cdef void _parseC(self, StateC** states,
|
cdef void _parseC(self, StateC** states,
|
||||||
|
@ -223,7 +224,7 @@ cdef class Parser:
|
||||||
unfinished.clear()
|
unfinished.clear()
|
||||||
free_activations(&activations)
|
free_activations(&activations)
|
||||||
|
|
||||||
def set_annotations(self, docs, states, tensors=None):
|
def set_annotations(self, docs, states):
|
||||||
cdef StateClass state
|
cdef StateClass state
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
for i, (state, doc) in enumerate(zip(states, docs)):
|
for i, (state, doc) in enumerate(zip(states, docs)):
|
||||||
|
@ -264,7 +265,7 @@ cdef class Parser:
|
||||||
states[i].push_hist(guess)
|
states[i].push_hist(guess)
|
||||||
free(is_valid)
|
free(is_valid)
|
||||||
|
|
||||||
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
|
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||||
cdef StateClass state
|
cdef StateClass state
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
|
@ -280,11 +281,12 @@ cdef class Parser:
|
||||||
[eg.predicted for eg in examples])
|
[eg.predicted for eg in examples])
|
||||||
if self.cfg["update_with_oracle_cut_size"] >= 1:
|
if self.cfg["update_with_oracle_cut_size"] >= 1:
|
||||||
# Chop sequences into lengths of this many transitions, to make the
|
# Chop sequences into lengths of this many transitions, to make the
|
||||||
# batch uniform length. We randomize this to overfit less.
|
# batch uniform length.
|
||||||
|
# We used to randomize this, but it's not clear that actually helps?
|
||||||
cut_size = self.cfg["update_with_oracle_cut_size"]
|
cut_size = self.cfg["update_with_oracle_cut_size"]
|
||||||
states, golds, max_steps = self._init_gold_batch(
|
states, golds, max_steps = self._init_gold_batch(
|
||||||
examples,
|
examples,
|
||||||
max_length=numpy.random.choice(range(5, cut_size))
|
max_length=cut_size
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||||
|
@ -292,24 +294,15 @@ cdef class Parser:
|
||||||
if not states:
|
if not states:
|
||||||
return losses
|
return losses
|
||||||
all_states = list(states)
|
all_states = list(states)
|
||||||
states_golds = zip(states, golds)
|
states_golds = list(zip(states, golds))
|
||||||
for _ in range(max_steps):
|
while states_golds:
|
||||||
if not states_golds:
|
|
||||||
break
|
|
||||||
states, golds = zip(*states_golds)
|
states, golds = zip(*states_golds)
|
||||||
scores, backprop = model.begin_update(states)
|
scores, backprop = model.begin_update(states)
|
||||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||||
if self.cfg["normalize_gradients_with_batch_size"]:
|
# Note that the gradient isn't normalized by the batch size
|
||||||
# We have to be very careful how we do this, because of the way we
|
# here, because our "samples" are really the states...But we
|
||||||
# cut up the batch. We subdivide long sequences. If we normalize
|
# can't normalize by the number of states either, as then we'd
|
||||||
# naively, we end up normalizing by sequence length, which
|
# be getting smaller gradients for states in long sequences.
|
||||||
# is bad: that would mean that states in long sequences
|
|
||||||
# consistently get smaller gradients. Imagine if we have two
|
|
||||||
# sequences, one length 1000, one length 20. If we cut up
|
|
||||||
# the 1k sequence so that we have a "batch" of 50 subsequences,
|
|
||||||
# we don't want the gradients to get 50 times smaller!
|
|
||||||
d_scores /= n_examples
|
|
||||||
|
|
||||||
backprop(d_scores)
|
backprop(d_scores)
|
||||||
# Follow the predicted action
|
# Follow the predicted action
|
||||||
self.transition_states(states, scores)
|
self.transition_states(states, scores)
|
||||||
|
@ -321,6 +314,13 @@ cdef class Parser:
|
||||||
if set_annotations:
|
if set_annotations:
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
self.set_annotations(docs, all_states)
|
self.set_annotations(docs, all_states)
|
||||||
|
# Ugh, this is annoying. If we're working on GPU, we want to free the
|
||||||
|
# memory ASAP. It seems that Python doesn't necessarily get around to
|
||||||
|
# removing these in time if we don't explicitly delete? It's confusing.
|
||||||
|
del backprop
|
||||||
|
del backprop_tok2vec
|
||||||
|
model.clear_memory()
|
||||||
|
del model
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
||||||
|
@ -344,7 +344,7 @@ cdef class Parser:
|
||||||
set_dropout_rate(self._rehearsal_model, 0.0)
|
set_dropout_rate(self._rehearsal_model, 0.0)
|
||||||
set_dropout_rate(self.model, 0.0)
|
set_dropout_rate(self.model, 0.0)
|
||||||
tutor, _ = self._rehearsal_model.begin_update(docs)
|
tutor, _ = self._rehearsal_model.begin_update(docs)
|
||||||
model, finish_update = self.model.begin_update(docs)
|
model, backprop_tok2vec = self.model.begin_update(docs)
|
||||||
n_scores = 0.
|
n_scores = 0.
|
||||||
loss = 0.
|
loss = 0.
|
||||||
while states:
|
while states:
|
||||||
|
@ -360,10 +360,16 @@ cdef class Parser:
|
||||||
states = [state for state in states if not state.is_final()]
|
states = [state for state in states if not state.is_final()]
|
||||||
n_scores += d_scores.size
|
n_scores += d_scores.size
|
||||||
# Do the backprop
|
# Do the backprop
|
||||||
finish_update(docs)
|
backprop_tok2vec(docs)
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
self.model.finish_update(sgd)
|
self.model.finish_update(sgd)
|
||||||
losses[self.name] += loss / n_scores
|
losses[self.name] += loss / n_scores
|
||||||
|
del backprop
|
||||||
|
del backprop_tok2vec
|
||||||
|
model.clear_memory()
|
||||||
|
tutor.clear_memory()
|
||||||
|
del model
|
||||||
|
del tutor
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def get_gradients(self):
|
def get_gradients(self):
|
||||||
|
@ -407,6 +413,7 @@ cdef class Parser:
|
||||||
cpu_log_loss(c_d_scores,
|
cpu_log_loss(c_d_scores,
|
||||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||||
c_d_scores += d_scores.shape[1]
|
c_d_scores += d_scores.shape[1]
|
||||||
|
# Note that we don't normalize this. See comment in update() for why.
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
losses[self.name] += (d_scores**2).sum()
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
@ -525,21 +532,25 @@ cdef class Parser:
|
||||||
StateClass state
|
StateClass state
|
||||||
Transition action
|
Transition action
|
||||||
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
||||||
|
states = []
|
||||||
|
golds = []
|
||||||
kept = []
|
kept = []
|
||||||
max_length_seen = 0
|
max_length_seen = 0
|
||||||
for state, eg in zip(all_states, examples):
|
for state, eg in zip(all_states, examples):
|
||||||
if self.moves.has_gold(eg) and not state.is_final():
|
if self.moves.has_gold(eg) and not state.is_final():
|
||||||
gold = self.moves.init_gold(state, eg)
|
gold = self.moves.init_gold(state, eg)
|
||||||
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
if len(eg.x) < max_length:
|
||||||
state.copy(), gold)
|
states.append(state)
|
||||||
kept.append((eg, state, gold, oracle_actions))
|
golds.append(gold)
|
||||||
min_length = min(min_length, len(oracle_actions))
|
else:
|
||||||
max_length_seen = max(max_length, len(oracle_actions))
|
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
||||||
|
state.copy(), gold)
|
||||||
|
kept.append((eg, state, gold, oracle_actions))
|
||||||
|
min_length = min(min_length, len(oracle_actions))
|
||||||
|
max_length_seen = max(max_length, len(oracle_actions))
|
||||||
if not kept:
|
if not kept:
|
||||||
return [], [], 0
|
return states, golds, 0
|
||||||
max_length = max(min_length, min(max_length, max_length_seen))
|
max_length = max(min_length, min(max_length, max_length_seen))
|
||||||
states = []
|
|
||||||
golds = []
|
|
||||||
cdef int clas
|
cdef int clas
|
||||||
max_moves = 0
|
max_moves = 0
|
||||||
for eg, state, gold, oracle_actions in kept:
|
for eg, state, gold, oracle_actions in kept:
|
||||||
|
|
|
@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
|
|
||||||
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
assert contains_cycle(tree) is None
|
assert contains_cycle(tree) is None
|
||||||
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
|
assert contains_cycle(cyclic_tree) == {3, 4, 5}
|
||||||
assert contains_cycle(partial_tree) is None
|
assert contains_cycle(partial_tree) is None
|
||||||
assert contains_cycle(multirooted_tree) is None
|
assert contains_cycle(multirooted_tree) is None
|
||||||
|
|
||||||
|
|
|
@ -198,10 +198,10 @@ def test_overfitting_IO():
|
||||||
nlp.add_pipe(parser)
|
nlp.add_pipe(parser)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
|
|
||||||
for i in range(50):
|
for i in range(100):
|
||||||
losses = {}
|
losses = {}
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
assert losses["parser"] < 0.00001
|
assert losses["parser"] < 0.0001
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like securities."
|
test_text = "I like securities."
|
||||||
|
|
|
@ -38,6 +38,11 @@ def test_overfitting_IO():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
|
# add some cases where SENT_START == -1
|
||||||
|
train_examples[0].reference[10].is_sent_start = False
|
||||||
|
train_examples[1].reference[1].is_sent_start = False
|
||||||
|
train_examples[1].reference[11].is_sent_start = False
|
||||||
|
|
||||||
nlp.add_pipe(senter)
|
nlp.add_pipe(senter)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
|
|
||||||
|
|
|
@ -84,7 +84,7 @@ def test_overfitting_IO():
|
||||||
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
|
||||||
fix_random_seed(0)
|
fix_random_seed(0)
|
||||||
nlp = English()
|
nlp = English()
|
||||||
textcat = nlp.create_pipe("textcat")
|
textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True})
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for text, annotations in TRAIN_DATA:
|
for text, annotations in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
|
|
|
@ -23,6 +23,7 @@ def test_issue2070():
|
||||||
assert len(doc) == 11
|
assert len(doc) == 11
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2179():
|
def test_issue2179():
|
||||||
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
|
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
|
||||||
nlp = Italian()
|
nlp = Italian()
|
||||||
|
@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
|
||||||
assert len(matches) == 3
|
assert len(matches) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2482():
|
def test_issue2482():
|
||||||
"""Test we can serialize and deserialize a blank NER or parser model."""
|
"""Test we can serialize and deserialize a blank NER or parser model."""
|
||||||
nlp = Italian()
|
nlp = Italian()
|
||||||
|
|
|
@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
|
||||||
assert doc[0].like_num
|
assert doc[0].like_num
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue2800():
|
def test_issue2800():
|
||||||
"""Test issue that arises when too many labels are added to NER model.
|
"""Test issue that arises when too many labels are added to NER model.
|
||||||
Used to cause segfault.
|
Used to cause segfault.
|
||||||
"""
|
"""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
train_data = []
|
train_data = []
|
||||||
train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
|
train_data.extend(
|
||||||
|
[Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
|
||||||
|
)
|
||||||
entity_types = [str(i) for i in range(1000)]
|
entity_types = [str(i) for i in range(1000)]
|
||||||
ner = nlp.create_pipe("ner")
|
ner = nlp.create_pipe("ner")
|
||||||
nlp.add_pipe(ner)
|
nlp.add_pipe(ner)
|
||||||
|
|
|
@ -88,6 +88,7 @@ def test_issue3199():
|
||||||
assert list(doc[0:3].noun_chunks) == []
|
assert list(doc[0:3].noun_chunks) == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue3209():
|
def test_issue3209():
|
||||||
"""Test issue that occurred in spaCy nightly where NER labels were being
|
"""Test issue that occurred in spaCy nightly where NER labels were being
|
||||||
mapped to classes incorrectly after loading the model, when the labels
|
mapped to classes incorrectly after loading the model, when the labels
|
||||||
|
|
spacy/tests/regression/test_issue3501-4000.py (new file, 472 lines)
|
@ -0,0 +1,472 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.pipeline import EntityRuler, DependencyParser
|
||||||
|
from spacy.pipeline.defaults import default_parser
|
||||||
|
from spacy import displacy, load
|
||||||
|
from spacy.displacy import parse_deps
|
||||||
|
from spacy.tokens import Doc, Token
|
||||||
|
from spacy.matcher import Matcher, PhraseMatcher
|
||||||
|
from spacy.errors import MatchPatternError
|
||||||
|
from spacy.util import minibatch
|
||||||
|
from spacy.gold import Example
|
||||||
|
from spacy.lang.hi import Hindi
|
||||||
|
from spacy.lang.es import Spanish
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.attrs import IS_ALPHA
|
||||||
|
from thinc.api import compounding
|
||||||
|
import spacy
|
||||||
|
import srsly
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
from ..util import make_tempdir, get_doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
||||||
|
def test_issue3521(en_tokenizer, word):
|
||||||
|
tok = en_tokenizer(word)[1]
|
||||||
|
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||||
|
assert tok.is_stop
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_1(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
ruler_bytes = ruler.to_bytes()
|
||||||
|
assert len(ruler) == len(patterns)
|
||||||
|
assert len(ruler.labels) == 4
|
||||||
|
assert ruler.overwrite
|
||||||
|
new_ruler = EntityRuler(nlp)
|
||||||
|
new_ruler = new_ruler.from_bytes(ruler_bytes)
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
assert len(new_ruler.labels) == 4
|
||||||
|
assert new_ruler.overwrite == ruler.overwrite
|
||||||
|
assert new_ruler.ent_id_sep == ruler.ent_id_sep
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_2(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
|
||||||
|
new_ruler = EntityRuler(nlp)
|
||||||
|
new_ruler = new_ruler.from_bytes(bytes_old_style)
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
for pattern in ruler.patterns:
|
||||||
|
assert pattern in new_ruler.patterns
|
||||||
|
assert new_ruler.overwrite is not ruler.overwrite
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3526_3(en_vocab):
|
||||||
|
patterns = [
|
||||||
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
|
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||||
|
]
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
out_file = tmpdir / "entity_ruler"
|
||||||
|
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
|
||||||
|
new_ruler = EntityRuler(nlp).from_disk(out_file)
|
||||||
|
for pattern in ruler.patterns:
|
||||||
|
assert pattern in new_ruler.patterns
|
||||||
|
assert len(new_ruler) == len(ruler)
|
||||||
|
assert new_ruler.overwrite is not ruler.overwrite
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue_3526_4(en_vocab):
|
||||||
|
nlp = Language(vocab=en_vocab)
|
||||||
|
ruler = EntityRuler(nlp, overwrite_ents=True)
|
||||||
|
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
nlp.to_disk(tmpdir)
|
||||||
|
ruler = nlp.get_pipe("entity_ruler")
|
||||||
|
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||||
|
assert ruler.overwrite is True
|
||||||
|
nlp2 = load(tmpdir)
|
||||||
|
new_ruler = nlp2.get_pipe("entity_ruler")
|
||||||
|
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||||
|
assert new_ruler.overwrite is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3531():
|
||||||
|
"""Test that displaCy renderer doesn't require "settings" key."""
|
||||||
|
example_dep = {
|
||||||
|
"words": [
|
||||||
|
{"text": "But", "tag": "CCONJ"},
|
||||||
|
{"text": "Google", "tag": "PROPN"},
|
||||||
|
{"text": "is", "tag": "VERB"},
|
||||||
|
{"text": "starting", "tag": "VERB"},
|
||||||
|
{"text": "from", "tag": "ADP"},
|
||||||
|
{"text": "behind.", "tag": "ADV"},
|
||||||
|
],
|
||||||
|
"arcs": [
|
||||||
|
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
|
||||||
|
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
|
||||||
|
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
|
||||||
|
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
|
||||||
|
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
example_ent = {
|
||||||
|
"text": "But Google is starting from behind.",
|
||||||
|
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
||||||
|
}
|
||||||
|
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
||||||
|
assert dep_html
|
||||||
|
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
||||||
|
assert ent_html
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3540(en_vocab):
|
||||||
|
words = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
tensor = numpy.asarray(
|
||||||
|
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
||||||
|
dtype="f",
|
||||||
|
)
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
doc.tensor = tensor
|
||||||
|
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
assert [token.text for token in doc] == gold_text
|
||||||
|
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
|
assert [token.lemma_ for token in doc] == gold_lemma
|
||||||
|
vectors_1 = [token.vector for token in doc]
|
||||||
|
assert len(vectors_1) == len(doc)
|
||||||
|
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
|
heads = [(doc[3], 1), doc[2]]
|
||||||
|
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||||
|
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||||
|
|
||||||
|
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||||
|
assert [token.text for token in doc] == gold_text
|
||||||
|
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
||||||
|
assert [token.lemma_ for token in doc] == gold_lemma
|
||||||
|
vectors_2 = [token.vector for token in doc]
|
||||||
|
assert len(vectors_2) == len(doc)
|
||||||
|
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
||||||
|
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
||||||
|
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
||||||
|
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
||||||
|
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3549(en_vocab):
|
||||||
|
"""Test that match pattern validation doesn't raise on empty errors."""
|
||||||
|
matcher = Matcher(en_vocab, validate=True)
|
||||||
|
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||||
|
matcher.add("GOOD", [pattern])
|
||||||
|
with pytest.raises(MatchPatternError):
|
||||||
|
matcher.add("BAD", [[{"X": "Y"}]])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_issue3555(en_vocab):
|
||||||
|
"""Test that custom extensions with default None don't break matcher."""
|
||||||
|
Token.set_extension("issue3555", default=None)
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["have", "apple"])
|
||||||
|
matcher(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3611():
|
||||||
|
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
||||||
|
unique_classes = ["offensive", "inoffensive"]
|
||||||
|
x_train = [
|
||||||
|
"This is an offensive text",
|
||||||
|
"This is the second offensive text",
|
||||||
|
"inoff",
|
||||||
|
]
|
||||||
|
y_train = ["offensive", "offensive", "inoffensive"]
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
# preparing the data
|
||||||
|
train_data = []
|
||||||
|
for text, train_instance in zip(x_train, y_train):
|
||||||
|
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||||
|
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||||
|
# add a text categorizer component
|
||||||
|
textcat = nlp.create_pipe(
|
||||||
|
"textcat",
|
||||||
|
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||||
|
)
|
||||||
|
for label in unique_classes:
|
||||||
|
textcat.add_label(label)
|
||||||
|
nlp.add_pipe(textcat, last=True)
|
||||||
|
# training the network
|
||||||
|
with nlp.select_pipes(enable="textcat"):
|
||||||
|
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
||||||
|
for i in range(3):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(
|
||||||
|
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3625():
|
||||||
|
"""Test that default punctuation rules applies to hindi unicode characters"""
|
||||||
|
nlp = Hindi()
|
||||||
|
doc = nlp("hi. how हुए. होटल, होटल")
|
||||||
|
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
||||||
|
assert [token.text for token in doc] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3803():
|
||||||
|
"""Test that spanish num-like tokens have True for like_num attribute."""
|
||||||
|
nlp = Spanish()
|
||||||
|
text = "2 dos 1000 mil 12 doce"
|
||||||
|
doc = nlp(text)
|
||||||
|
|
||||||
|
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3830_no_subtok():
|
||||||
|
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||||
|
config = {
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||||
|
parser.add_label("nsubj")
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
parser.begin_training(lambda: [])
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3830_with_subtok():
|
||||||
|
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||||
|
config = {
|
||||||
|
"learn_tokens": True,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||||
|
parser.add_label("nsubj")
|
||||||
|
assert "subtok" not in parser.labels
|
||||||
|
parser.begin_training(lambda: [])
|
||||||
|
assert "subtok" in parser.labels
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3839(en_vocab):
|
||||||
|
"""Test that match IDs returned by the matcher are correct, are in the string """
|
||||||
|
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
match_id = "PATTERN"
|
||||||
|
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||||
|
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||||
|
matcher.add(match_id, [pattern1])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches[0][0] == en_vocab.strings[match_id]
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add(match_id, [pattern2])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches[0][0] == en_vocab.strings[match_id]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"sentence",
|
||||||
|
[
|
||||||
|
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
||||||
|
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
||||||
|
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
||||||
|
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
||||||
|
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_issue3869(sentence):
|
||||||
|
"""Test that the Doc's count_by function works consistently"""
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(sentence)
|
||||||
|
count = 0
|
||||||
|
for token in doc:
|
||||||
|
count += token.is_alpha
|
||||||
|
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3879(en_vocab):
|
||||||
|
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||||
|
assert len(doc) == 5
|
||||||
|
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue3880():
|
||||||
|
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
||||||
|
|
||||||
|
Fixed in v7.0.5 of Thinc.
|
||||||
|
"""
|
||||||
|
texts = ["hello", "world", "", ""]
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
|
nlp.add_pipe(nlp.create_pipe("ner"))
|
||||||
|
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||||
|
nlp.get_pipe("parser").add_label("dep")
|
||||||
|
nlp.get_pipe("ner").add_label("PERSON")
|
||||||
|
nlp.get_pipe("tagger").add_label("NN")
|
||||||
|
nlp.begin_training()
|
||||||
|
for doc in nlp.pipe(texts):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3882(en_vocab):
|
||||||
|
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
||||||
|
copy of the Doc.
|
||||||
|
"""
|
||||||
|
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||||
|
doc.is_parsed = True
|
||||||
|
doc.user_data["test"] = set()
|
||||||
|
parse_deps(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3951(en_vocab):
|
||||||
|
"""Test that combinations of optional rules are matched correctly."""
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [
|
||||||
|
{"LOWER": "hello"},
|
||||||
|
{"LOWER": "this", "OP": "?"},
|
||||||
|
{"OP": "?"},
|
||||||
|
{"LOWER": "world"},
|
||||||
|
]
|
||||||
|
matcher.add("TEST", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3959():
|
||||||
|
""" Ensure that a modified pos attribute is serialized correctly."""
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(
|
||||||
|
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
||||||
|
)
|
||||||
|
assert doc[0].pos_ == ""
|
||||||
|
doc[0].pos_ = "NOUN"
|
||||||
|
assert doc[0].pos_ == "NOUN"
|
||||||
|
# usually this is already True when starting from proper models instead of blank English
|
||||||
|
doc.is_tagged = True
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
file_path = tmp_dir / "my_doc"
|
||||||
|
doc.to_disk(file_path)
|
||||||
|
doc2 = nlp("")
|
||||||
|
doc2.from_disk(file_path)
|
||||||
|
assert doc2[0].pos_ == "NOUN"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3962(en_vocab):
|
||||||
|
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
|
# fmt: off
|
||||||
|
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
||||||
|
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
||||||
|
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
|
span2 = doc[1:5] # "jests at scars ,"
|
||||||
|
doc2 = span2.as_doc()
|
||||||
|
doc2_json = doc2.to_json()
|
||||||
|
assert doc2_json
|
||||||
|
# head set to itself, being the new artificial root
|
||||||
|
assert doc2[0].head.text == "jests"
|
||||||
|
assert doc2[0].dep_ == "dep"
|
||||||
|
assert doc2[1].head.text == "jests"
|
||||||
|
assert doc2[1].dep_ == "prep"
|
||||||
|
assert doc2[2].head.text == "at"
|
||||||
|
assert doc2[2].dep_ == "pobj"
|
||||||
|
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
||||||
|
assert doc2[3].dep_ == "dep"
|
||||||
|
# We should still have 1 sentence
|
||||||
|
assert len(list(doc2.sents)) == 1
|
||||||
|
span3 = doc[6:9] # "never felt a"
|
||||||
|
doc3 = span3.as_doc()
|
||||||
|
doc3_json = doc3.to_json()
|
||||||
|
assert doc3_json
|
||||||
|
assert doc3[0].head.text == "felt"
|
||||||
|
assert doc3[0].dep_ == "neg"
|
||||||
|
assert doc3[1].head.text == "felt"
|
||||||
|
assert doc3[1].dep_ == "ROOT"
|
||||||
|
assert doc3[2].head.text == "felt" # head set to ancestor
|
||||||
|
assert doc3[2].dep_ == "dep"
|
||||||
|
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
||||||
|
assert len(list(doc3.sents)) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3962_long(en_vocab):
|
||||||
|
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
|
# fmt: off
|
||||||
|
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
||||||
|
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
||||||
|
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
|
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
||||||
|
doc2 = span2.as_doc()
|
||||||
|
doc2_json = doc2.to_json()
|
||||||
|
assert doc2_json
|
||||||
|
# head set to itself, being the new artificial root (in sentence 1)
|
||||||
|
assert doc2[0].head.text == "jests"
|
||||||
|
assert doc2[0].dep_ == "ROOT"
|
||||||
|
assert doc2[1].head.text == "jests"
|
||||||
|
assert doc2[1].dep_ == "prep"
|
||||||
|
assert doc2[2].head.text == "at"
|
||||||
|
assert doc2[2].dep_ == "pobj"
|
||||||
|
assert doc2[3].head.text == "jests"
|
||||||
|
assert doc2[3].dep_ == "punct"
|
||||||
|
# head set to itself, being the new artificial root (in sentence 2)
|
||||||
|
assert doc2[4].head.text == "They"
|
||||||
|
assert doc2[4].dep_ == "dep"
|
||||||
|
# head set to the new artificial head (in sentence 2)
|
||||||
|
assert doc2[4].head.text == "They"
|
||||||
|
assert doc2[4].dep_ == "dep"
|
||||||
|
# We should still have 2 sentences
|
||||||
|
sents = list(doc2.sents)
|
||||||
|
assert len(sents) == 2
|
||||||
|
assert sents[0].text == "jests at scars ."
|
||||||
|
assert sents[1].text == "They never"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3972(en_vocab):
|
||||||
|
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||||
|
"""
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||||
|
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||||
|
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
|
||||||
|
assert len(matches) == 2
|
||||||
|
|
||||||
|
# We should have a match for each of the two rules
|
||||||
|
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
||||||
|
assert "A" in found_ids
|
||||||
|
assert "B" in found_ids
|
|
@ -1,8 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
|
||||||
def test_issue3521(en_tokenizer, word):
|
|
||||||
tok = en_tokenizer(word)[1]
|
|
||||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
|
||||||
assert tok.is_stop
|
|
|
@ -1,85 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.tokens import Span
|
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
from spacy import load
|
|
||||||
import srsly
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def patterns():
|
|
||||||
return [
|
|
||||||
{"label": "HELLO", "pattern": "hello world"},
|
|
||||||
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
|
||||||
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
|
||||||
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
|
||||||
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def add_ent():
|
|
||||||
def add_ent_component(doc):
|
|
||||||
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
|
|
||||||
return doc
|
|
||||||
|
|
||||||
return add_ent_component
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
ruler_bytes = ruler.to_bytes()
|
|
||||||
assert len(ruler) == len(patterns)
|
|
||||||
assert len(ruler.labels) == 4
|
|
||||||
assert ruler.overwrite
|
|
||||||
new_ruler = EntityRuler(nlp)
|
|
||||||
new_ruler = new_ruler.from_bytes(ruler_bytes)
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
assert len(new_ruler.labels) == 4
|
|
||||||
assert new_ruler.overwrite == ruler.overwrite
|
|
||||||
assert new_ruler.ent_id_sep == ruler.ent_id_sep
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
|
|
||||||
new_ruler = EntityRuler(nlp)
|
|
||||||
new_ruler = new_ruler.from_bytes(bytes_old_style)
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
for pattern in ruler.patterns:
|
|
||||||
assert pattern in new_ruler.patterns
|
|
||||||
assert new_ruler.overwrite is not ruler.overwrite
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
out_file = tmpdir / "entity_ruler"
|
|
||||||
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
|
|
||||||
new_ruler = EntityRuler(nlp).from_disk(out_file)
|
|
||||||
for pattern in ruler.patterns:
|
|
||||||
assert pattern in new_ruler.patterns
|
|
||||||
assert len(new_ruler) == len(ruler)
|
|
||||||
assert new_ruler.overwrite is not ruler.overwrite
|
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
|
|
||||||
nlp = Language(vocab=en_vocab)
|
|
||||||
ruler = EntityRuler(nlp, overwrite_ents=True)
|
|
||||||
|
|
||||||
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
with make_tempdir() as tmpdir:
|
|
||||||
nlp.to_disk(tmpdir)
|
|
||||||
ruler = nlp.get_pipe("entity_ruler")
|
|
||||||
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
|
||||||
assert ruler.overwrite is True
|
|
||||||
nlp2 = load(tmpdir)
|
|
||||||
new_ruler = nlp2.get_pipe("entity_ruler")
|
|
||||||
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
|
||||||
assert new_ruler.overwrite is True
|
|
|
@ -1,30 +0,0 @@
|
||||||
from spacy import displacy
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3531():
|
|
||||||
"""Test that displaCy renderer doesn't require "settings" key."""
|
|
||||||
example_dep = {
|
|
||||||
"words": [
|
|
||||||
{"text": "But", "tag": "CCONJ"},
|
|
||||||
{"text": "Google", "tag": "PROPN"},
|
|
||||||
{"text": "is", "tag": "VERB"},
|
|
||||||
{"text": "starting", "tag": "VERB"},
|
|
||||||
{"text": "from", "tag": "ADP"},
|
|
||||||
{"text": "behind.", "tag": "ADV"},
|
|
||||||
],
|
|
||||||
"arcs": [
|
|
||||||
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
|
|
||||||
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
|
|
||||||
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
|
|
||||||
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
|
|
||||||
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
example_ent = {
|
|
||||||
"text": "But Google is starting from behind.",
|
|
||||||
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
|
||||||
}
|
|
||||||
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
|
||||||
assert dep_html
|
|
||||||
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
|
||||||
assert ent_html
|
|
|
@ -1,44 +0,0 @@
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3540(en_vocab):
|
|
||||||
|
|
||||||
words = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
tensor = np.asarray(
|
|
||||||
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
|
||||||
dtype="f",
|
|
||||||
)
|
|
||||||
doc = Doc(en_vocab, words=words)
|
|
||||||
doc.tensor = tensor
|
|
||||||
|
|
||||||
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
assert [token.text for token in doc] == gold_text
|
|
||||||
|
|
||||||
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
|
||||||
assert [token.lemma_ for token in doc] == gold_lemma
|
|
||||||
|
|
||||||
vectors_1 = [token.vector for token in doc]
|
|
||||||
assert len(vectors_1) == len(doc)
|
|
||||||
|
|
||||||
with doc.retokenize() as retokenizer:
|
|
||||||
heads = [(doc[3], 1), doc[2]]
|
|
||||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
|
||||||
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
|
||||||
|
|
||||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
|
||||||
assert [token.text for token in doc] == gold_text
|
|
||||||
|
|
||||||
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
|
||||||
assert [token.lemma_ for token in doc] == gold_lemma
|
|
||||||
|
|
||||||
vectors_2 = [token.vector for token in doc]
|
|
||||||
assert len(vectors_2) == len(doc)
|
|
||||||
|
|
||||||
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
|
||||||
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
|
||||||
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
|
||||||
|
|
||||||
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
|
||||||
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
|
|
@ -1,12 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.errors import MatchPatternError
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3549(en_vocab):
|
|
||||||
"""Test that match pattern validation doesn't raise on empty errors."""
|
|
||||||
matcher = Matcher(en_vocab, validate=True)
|
|
||||||
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
|
||||||
matcher.add("GOOD", [pattern])
|
|
||||||
with pytest.raises(MatchPatternError):
|
|
||||||
matcher.add("BAD", [[{"X": "Y"}]])
|
|
|
@ -1,14 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.tokens import Doc, Token
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_issue3555(en_vocab):
|
|
||||||
"""Test that custom extensions with default None don't break matcher."""
|
|
||||||
Token.set_extension("issue3555", default=None)
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
doc = Doc(en_vocab, words=["have", "apple"])
|
|
||||||
matcher(doc)
|
|
|
@ -1,45 +0,0 @@
|
||||||
import spacy
|
|
||||||
from spacy.util import minibatch
|
|
||||||
from thinc.api import compounding
|
|
||||||
from spacy.gold import Example
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3611():
|
|
||||||
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
|
||||||
unique_classes = ["offensive", "inoffensive"]
|
|
||||||
x_train = [
|
|
||||||
"This is an offensive text",
|
|
||||||
"This is the second offensive text",
|
|
||||||
"inoff",
|
|
||||||
]
|
|
||||||
y_train = ["offensive", "offensive", "inoffensive"]
|
|
||||||
|
|
||||||
nlp = spacy.blank("en")
|
|
||||||
|
|
||||||
# preparing the data
|
|
||||||
train_data = []
|
|
||||||
for text, train_instance in zip(x_train, y_train):
|
|
||||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
|
||||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
|
||||||
|
|
||||||
# add a text categorizer component
|
|
||||||
textcat = nlp.create_pipe(
|
|
||||||
"textcat",
|
|
||||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
|
||||||
)
|
|
||||||
|
|
||||||
for label in unique_classes:
|
|
||||||
textcat.add_label(label)
|
|
||||||
nlp.add_pipe(textcat, last=True)
|
|
||||||
|
|
||||||
# training the network
|
|
||||||
with nlp.select_pipes(enable="textcat"):
|
|
||||||
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
|
||||||
for i in range(3):
|
|
||||||
losses = {}
|
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
|
||||||
|
|
||||||
for batch in batches:
|
|
||||||
nlp.update(
|
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
|
@ -1,9 +0,0 @@
|
||||||
from spacy.lang.hi import Hindi
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3625():
|
|
||||||
"""Test that default punctuation rules applies to hindi unicode characters"""
|
|
||||||
nlp = Hindi()
|
|
||||||
doc = nlp("hi. how हुए. होटल, होटल")
|
|
||||||
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
|
||||||
assert [token.text for token in doc] == expected
|
|
|
@ -1,10 +0,0 @@
|
||||||
from spacy.lang.es import Spanish
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3803():
|
|
||||||
"""Test that spanish num-like tokens have True for like_num attribute."""
|
|
||||||
nlp = Spanish()
|
|
||||||
text = "2 dos 1000 mil 12 doce"
|
|
||||||
doc = nlp(text)
|
|
||||||
|
|
||||||
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
|
|
@ -1,34 +0,0 @@
|
||||||
from spacy.pipeline.pipes import DependencyParser
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
from spacy.pipeline.defaults import default_parser
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3830_no_subtok():
|
|
||||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
|
||||||
config = {
|
|
||||||
"learn_tokens": False,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
|
||||||
parser.add_label("nsubj")
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
parser.begin_training(lambda: [])
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3830_with_subtok():
|
|
||||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
|
||||||
config = {
|
|
||||||
"learn_tokens": True,
|
|
||||||
"min_action_freq": 30,
|
|
||||||
"beam_width": 1,
|
|
||||||
"beam_update_prob": 1.0,
|
|
||||||
}
|
|
||||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
|
||||||
parser.add_label("nsubj")
|
|
||||||
assert "subtok" not in parser.labels
|
|
||||||
parser.begin_training(lambda: [])
|
|
||||||
assert "subtok" in parser.labels
|
|
|
@ -1,18 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3839(en_vocab):
|
|
||||||
"""Test that match IDs returned by the matcher are correct, are in the string """
|
|
||||||
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
match_id = "PATTERN"
|
|
||||||
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
|
||||||
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
|
||||||
matcher.add(match_id, [pattern1])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert matches[0][0] == en_vocab.strings[match_id]
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add(match_id, [pattern2])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert matches[0][0] == en_vocab.strings[match_id]
|
|
|
@ -1,25 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.attrs import IS_ALPHA
|
|
||||||
from spacy.lang.en import English
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"sentence",
|
|
||||||
[
|
|
||||||
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
|
||||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
|
||||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
|
||||||
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
|
||||||
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_issue3869(sentence):
|
|
||||||
"""Test that the Doc's count_by function works consistently"""
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(sentence)
|
|
||||||
|
|
||||||
count = 0
|
|
||||||
for token in doc:
|
|
||||||
count += token.is_alpha
|
|
||||||
|
|
||||||
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
|
|
@ -1,11 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3879(en_vocab):
|
|
||||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
|
||||||
assert len(doc) == 5
|
|
||||||
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
|
|
@ -1,21 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
|
||||||
def test_issue3880():
|
|
||||||
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
|
||||||
|
|
||||||
Fixed in v7.0.5 of Thinc.
|
|
||||||
"""
|
|
||||||
texts = ["hello", "world", "", ""]
|
|
||||||
nlp = English()
|
|
||||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
|
||||||
nlp.add_pipe(nlp.create_pipe("ner"))
|
|
||||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
|
||||||
nlp.get_pipe("parser").add_label("dep")
|
|
||||||
nlp.get_pipe("ner").add_label("PERSON")
|
|
||||||
nlp.get_pipe("tagger").add_label("NN")
|
|
||||||
nlp.begin_training()
|
|
||||||
for doc in nlp.pipe(texts):
|
|
||||||
pass
|
|
|
@ -1,12 +0,0 @@
|
||||||
from spacy.displacy import parse_deps
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3882(en_vocab):
|
|
||||||
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
|
||||||
copy of the Doc.
|
|
||||||
"""
|
|
||||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
|
||||||
doc.is_parsed = True
|
|
||||||
doc.user_data["test"] = set()
|
|
||||||
parse_deps(doc)
|
|
|
@ -1,17 +0,0 @@
|
||||||
from spacy.matcher import Matcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3951(en_vocab):
|
|
||||||
"""Test that combinations of optional rules are matched correctly."""
|
|
||||||
matcher = Matcher(en_vocab)
|
|
||||||
pattern = [
|
|
||||||
{"LOWER": "hello"},
|
|
||||||
{"LOWER": "this", "OP": "?"},
|
|
||||||
{"OP": "?"},
|
|
||||||
{"LOWER": "world"},
|
|
||||||
]
|
|
||||||
matcher.add("TEST", [pattern])
|
|
||||||
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
|
||||||
matches = matcher(doc)
|
|
||||||
assert len(matches) == 0
|
|
|
@ -1,26 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3959():
|
|
||||||
""" Ensure that a modified pos attribute is serialized correctly."""
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(
|
|
||||||
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
|
||||||
)
|
|
||||||
assert doc[0].pos_ == ""
|
|
||||||
|
|
||||||
doc[0].pos_ = "NOUN"
|
|
||||||
assert doc[0].pos_ == "NOUN"
|
|
||||||
|
|
||||||
# usually this is already True when starting from proper models instead of blank English
|
|
||||||
doc.is_tagged = True
|
|
||||||
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
file_path = tmp_dir / "my_doc"
|
|
||||||
doc.to_disk(file_path)
|
|
||||||
|
|
||||||
doc2 = nlp("")
|
|
||||||
doc2.from_disk(file_path)
|
|
||||||
|
|
||||||
assert doc2[0].pos_ == "NOUN"
|
|
|
@ -1,117 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from ..util import get_doc
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def doc(en_tokenizer):
|
|
||||||
text = "He jests at scars, that never felt a wound."
|
|
||||||
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
|
||||||
deps = [
|
|
||||||
"nsubj",
|
|
||||||
"ccomp",
|
|
||||||
"prep",
|
|
||||||
"pobj",
|
|
||||||
"punct",
|
|
||||||
"nsubj",
|
|
||||||
"neg",
|
|
||||||
"ROOT",
|
|
||||||
"det",
|
|
||||||
"dobj",
|
|
||||||
"punct",
|
|
||||||
]
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962(doc):
|
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
|
||||||
span2 = doc[1:5] # "jests at scars ,"
|
|
||||||
doc2 = span2.as_doc()
|
|
||||||
doc2_json = doc2.to_json()
|
|
||||||
assert doc2_json
|
|
||||||
|
|
||||||
assert (
|
|
||||||
doc2[0].head.text == "jests"
|
|
||||||
) # head set to itself, being the new artificial root
|
|
||||||
assert doc2[0].dep_ == "dep"
|
|
||||||
assert doc2[1].head.text == "jests"
|
|
||||||
assert doc2[1].dep_ == "prep"
|
|
||||||
assert doc2[2].head.text == "at"
|
|
||||||
assert doc2[2].dep_ == "pobj"
|
|
||||||
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
|
||||||
assert doc2[3].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 1 sentence
|
|
||||||
assert len(list(doc2.sents)) == 1
|
|
||||||
|
|
||||||
span3 = doc[6:9] # "never felt a"
|
|
||||||
doc3 = span3.as_doc()
|
|
||||||
doc3_json = doc3.to_json()
|
|
||||||
assert doc3_json
|
|
||||||
|
|
||||||
assert doc3[0].head.text == "felt"
|
|
||||||
assert doc3[0].dep_ == "neg"
|
|
||||||
assert doc3[1].head.text == "felt"
|
|
||||||
assert doc3[1].dep_ == "ROOT"
|
|
||||||
assert doc3[2].head.text == "felt" # head set to ancestor
|
|
||||||
assert doc3[2].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
|
||||||
assert len(list(doc3.sents)) == 1
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def two_sent_doc(en_tokenizer):
|
|
||||||
text = "He jests at scars. They never felt a wound."
|
|
||||||
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
|
||||||
deps = [
|
|
||||||
"nsubj",
|
|
||||||
"ROOT",
|
|
||||||
"prep",
|
|
||||||
"pobj",
|
|
||||||
"punct",
|
|
||||||
"nsubj",
|
|
||||||
"neg",
|
|
||||||
"ROOT",
|
|
||||||
"det",
|
|
||||||
"dobj",
|
|
||||||
"punct",
|
|
||||||
]
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962_long(two_sent_doc):
|
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
|
||||||
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
|
||||||
doc2 = span2.as_doc()
|
|
||||||
doc2_json = doc2.to_json()
|
|
||||||
assert doc2_json
|
|
||||||
|
|
||||||
assert (
|
|
||||||
doc2[0].head.text == "jests"
|
|
||||||
) # head set to itself, being the new artificial root (in sentence 1)
|
|
||||||
assert doc2[0].dep_ == "ROOT"
|
|
||||||
assert doc2[1].head.text == "jests"
|
|
||||||
assert doc2[1].dep_ == "prep"
|
|
||||||
assert doc2[2].head.text == "at"
|
|
||||||
assert doc2[2].dep_ == "pobj"
|
|
||||||
assert doc2[3].head.text == "jests"
|
|
||||||
assert doc2[3].dep_ == "punct"
|
|
||||||
assert (
|
|
||||||
doc2[4].head.text == "They"
|
|
||||||
) # head set to itself, being the new artificial root (in sentence 2)
|
|
||||||
assert doc2[4].dep_ == "dep"
|
|
||||||
assert (
|
|
||||||
doc2[4].head.text == "They"
|
|
||||||
) # head set to the new artificial head (in sentence 2)
|
|
||||||
assert doc2[4].dep_ == "dep"
|
|
||||||
|
|
||||||
# We should still have 2 sentences
|
|
||||||
sents = list(doc2.sents)
|
|
||||||
assert len(sents) == 2
|
|
||||||
assert sents[0].text == "jests at scars ."
|
|
||||||
assert sents[1].text == "They never"
|
|
|
@ -1,19 +0,0 @@
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3972(en_vocab):
|
|
||||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab)
|
|
||||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
|
||||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
|
||||||
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
|
||||||
matches = matcher(doc)
|
|
||||||
|
|
||||||
assert len(matches) == 2
|
|
||||||
|
|
||||||
# We should have a match for each of the two rules
|
|
||||||
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
|
||||||
assert "A" in found_ids
|
|
||||||
assert "B" in found_ids
|
|
spacy/tests/regression/test_issue4001-4500.py (new file, 469 lines)
|
@ -0,0 +1,469 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
|
||||||
|
from spacy.pipeline.defaults import default_ner
|
||||||
|
from spacy.matcher import PhraseMatcher, Matcher
|
||||||
|
from spacy.tokens import Doc, Span, DocBin
|
||||||
|
from spacy.gold import Example, Corpus
|
||||||
|
from spacy.gold.converters import json2docs
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.util import minibatch, ensure_path, load_model
|
||||||
|
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
from spacy.lang.el import Greek
|
||||||
|
from spacy.language import Language
|
||||||
|
import spacy
|
||||||
|
from thinc.api import compounding
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4002(en_vocab):
|
||||||
|
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
||||||
|
"""
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
|
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||||
|
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||||
|
matcher.add("TEST", [pattern1])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
||||||
|
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
|
pattern2 = Doc(en_vocab, words=["1", "2"])
|
||||||
|
pattern2[0].norm_ = "c"
|
||||||
|
pattern2[1].norm_ = "d"
|
||||||
|
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
||||||
|
matcher.add("TEST", [pattern2])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4030():
|
||||||
|
""" Test whether textcat works fine with empty doc """
|
||||||
|
unique_classes = ["offensive", "inoffensive"]
|
||||||
|
x_train = [
|
||||||
|
"This is an offensive text",
|
||||||
|
"This is the second offensive text",
|
||||||
|
"inoff",
|
||||||
|
]
|
||||||
|
y_train = ["offensive", "offensive", "inoffensive"]
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
# preparing the data
|
||||||
|
train_data = []
|
||||||
|
for text, train_instance in zip(x_train, y_train):
|
||||||
|
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||||
|
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||||
|
# add a text categorizer component
|
||||||
|
textcat = nlp.create_pipe(
|
||||||
|
"textcat",
|
||||||
|
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||||
|
)
|
||||||
|
for label in unique_classes:
|
||||||
|
textcat.add_label(label)
|
||||||
|
nlp.add_pipe(textcat, last=True)
|
||||||
|
# training the network
|
||||||
|
with nlp.select_pipes(enable="textcat"):
|
||||||
|
optimizer = nlp.begin_training()
|
||||||
|
for i in range(3):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(
|
||||||
|
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||||
|
)
|
||||||
|
# processing of an empty doc should result in 0.0 for all categories
|
||||||
|
doc = nlp("")
|
||||||
|
assert doc.cats["offensive"] == 0.0
|
||||||
|
assert doc.cats["inoffensive"] == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4042():
|
||||||
|
"""Test that serialization of an EntityRuler before NER works fine."""
|
||||||
|
nlp = English()
|
||||||
|
|
||||||
|
# add ner pipe
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
ner.add_label("SOME_LABEL")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
# Add entity ruler
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [
|
||||||
|
{"label": "MY_ORG", "pattern": "Apple"},
|
||||||
|
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
|
||||||
|
]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler, before="ner") # works fine with "after"
|
||||||
|
doc1 = nlp("What do you think about Apple ?")
|
||||||
|
assert doc1.ents[0].label_ == "MY_ORG"
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
output_dir = ensure_path(d)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
nlp.to_disk(output_dir)
|
||||||
|
|
||||||
|
nlp2 = load_model(output_dir)
|
||||||
|
doc2 = nlp2("What do you think about Apple ?")
|
||||||
|
assert doc2.ents[0].label_ == "MY_ORG"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4042_bug2():
|
||||||
|
"""
|
||||||
|
Test that serialization of an NER works fine when new labels were added.
|
||||||
|
This is the second of two bugs underlying issue 4042.
|
||||||
|
"""
|
||||||
|
nlp1 = English()
|
||||||
|
vocab = nlp1.vocab
|
||||||
|
|
||||||
|
# add ner pipe
|
||||||
|
ner1 = nlp1.create_pipe("ner")
|
||||||
|
ner1.add_label("SOME_LABEL")
|
||||||
|
nlp1.add_pipe(ner1)
|
||||||
|
nlp1.begin_training()
|
||||||
|
|
||||||
|
# add a new label to the doc
|
||||||
|
doc1 = nlp1("What do you think about Apple ?")
|
||||||
|
assert len(ner1.labels) == 1
|
||||||
|
assert "SOME_LABEL" in ner1.labels
|
||||||
|
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
|
||||||
|
doc1.ents = list(doc1.ents) + [apple_ent]
|
||||||
|
|
||||||
|
# reapply the NER - at this point it should resize itself
|
||||||
|
ner1(doc1)
|
||||||
|
assert len(ner1.labels) == 2
|
||||||
|
assert "SOME_LABEL" in ner1.labels
|
||||||
|
assert "MY_ORG" in ner1.labels
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
# assert IO goes fine
|
||||||
|
output_dir = ensure_path(d)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
ner1.to_disk(output_dir)
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 1,
|
||||||
|
"beam_update_prob": 1.0,
|
||||||
|
}
|
||||||
|
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
||||||
|
ner2.from_disk(output_dir)
|
||||||
|
assert len(ner2.labels) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4054(en_vocab):
|
||||||
|
"""Test that a new blank model can be made with a vocab from file,
|
||||||
|
and that serialization does not drop the language at any point."""
|
||||||
|
nlp1 = English()
|
||||||
|
vocab1 = nlp1.vocab
|
||||||
|
with make_tempdir() as d:
|
||||||
|
vocab_dir = ensure_path(d / "vocab")
|
||||||
|
if not vocab_dir.exists():
|
||||||
|
vocab_dir.mkdir()
|
||||||
|
vocab1.to_disk(vocab_dir)
|
||||||
|
vocab2 = Vocab().from_disk(vocab_dir)
|
||||||
|
print("lang", vocab2.lang)
|
||||||
|
nlp2 = spacy.blank("en", vocab=vocab2)
|
||||||
|
nlp_dir = ensure_path(d / "nlp")
|
||||||
|
if not nlp_dir.exists():
|
||||||
|
nlp_dir.mkdir()
|
||||||
|
nlp2.to_disk(nlp_dir)
|
||||||
|
nlp3 = load_model(nlp_dir)
|
||||||
|
assert nlp3.lang == "en"
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4120(en_vocab):
|
||||||
|
"""Test that matches without a final {OP: ?} token are returned."""
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
|
||||||
|
doc1 = Doc(en_vocab, words=["a"])
|
||||||
|
assert len(matcher(doc1)) == 1 # works
|
||||||
|
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
assert len(matcher(doc2)) == 2 # fixed
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
|
||||||
|
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||||
|
assert len(matcher(doc3)) == 2 # works
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
|
||||||
|
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
    assert len(matcher(doc4)) == 3 # fixed


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)
    actual = []
    for token in doc:
        actual.append(token.pos_)
    assert actual == pos


def test_issue4190():
    def customize_tokenizer(nlp):
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
        infix_re = compile_infix_regex(nlp.Defaults.infixes)
        # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
        exceptions = {
            k: v
            for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
            if not (len(k) == 2 and k[1] == ".")
        }
        new_tokenizer = Tokenizer(
            nlp.vocab,
            exceptions,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=nlp.tokenizer.token_match,
        )
        nlp.tokenizer = new_tokenizer

    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a] # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = load_model(model_dir)
    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
    """ Test that running an entity_ruler after ner gives consistent results"""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()
    assert "ner" in nlp.pipe_names
    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2
    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names
    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_


def test_multiple_predictions():
    class DummyPipe(Pipe):
        def __init__(self):
            self.model = "dummy_model"

        def predict(self, docs):
            return ([1, 2, 3], [4, 5, 6])

        def set_annotations(self, docs, scores):
            return docs

    nlp = Language()
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)


@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(
        docs, beam_width=beam_width, beam_density=beam_density
    )

    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data, doesn't throw errors"""
    nlp = English()
    example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    TRAIN_DATA = [example, example]
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)


def test_issue4367():
    """Test that docbin init goes well"""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])


def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)


def test_issue4402():
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json2docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4
@@ -1,23 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes.
    """
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1

@@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example


def test_issue4030():
    """ Test whether textcat works fine with empty doc """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )

    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )

    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0

@@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner

from ..util import make_tempdir


def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner") # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = spacy.load(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2

@@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path

from ..util import make_tempdir


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab

    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)

        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)

        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"

@@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1 # works

    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2 # fixed

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2 # works

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3 # fixed
288 spacy/tests/regression/test_issue4501-5000.py Normal file
@@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle

from ..util import get_doc, make_tempdir


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})


def test_issue4590(en_vocab):
    """Test that matches param in on_match method are the same as matches run with no on_match method"""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()
    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)
    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    matches = matcher(doc)
    on_match_args = on_match.call_args
    assert on_match_args[0][3] == matches


def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded


def test_issue4651_without_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    not specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded


def test_issue4665():
    """
    conllu2json should not raise an exception if the HEAD column contains an
    underscore
    """
    input_data = """
    1 [ _ PUNCT -LRB- _ _ punct _ _
    2 This _ DET DT _ _ det _ _
    3 killing _ NOUN NN _ _ nsubj _ _
    4 of _ ADP IN _ _ case _ _
    5 a _ DET DT _ _ det _ _
    6 respected _ ADJ JJ _ _ amod _ _
    7 cleric _ NOUN NN _ _ nmod _ _
    8 will _ AUX MD _ _ aux _ _
    9 be _ AUX VB _ _ aux _ _
    10 causing _ VERB VBG _ _ root _ _
    11 us _ PRON PRP _ _ iobj _ _
    12 trouble _ NOUN NN _ _ dobj _ _
    13 for _ ADP IN _ _ case _ _
    14 years _ NOUN NNS _ _ nmod _ _
    15 to _ PART TO _ _ mark _ _
    16 come _ VERB VB _ _ acl _ _
    17 . _ PUNCT . _ _ punct _ _
    18 ] _ PUNCT -RRB- _ _ punct _ _
    """
    conllu2docs(input_data)


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
        assert kb2.get_size_entities() == 1


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
        assert "sentencizer" not in new_nlp.pipe_names
        assert "entity_ruler" in new_nlp.pipe_names


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
    """ Ensure the pickling of the NER goes well"""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass


def test_issue4849():
    nlp = English()
    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )
    nlp.add_pipe(ruler)
    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """
    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    """Ensure that this runs correctly and doesn't hang or crash on Windows /
    macOS."""
    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."


def test_issue4924():
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])
@ -1,6 +1,8 @@
|
||||||
|
import pytest
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_issue5152():
|
def test_issue5152():
|
||||||
# Test that the comparison between a Span and a Token, goes well
|
# Test that the comparison between a Span and a Token, goes well
|
||||||
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
|
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
|
||||||
|
@ -8,7 +10,6 @@ def test_issue5152():
|
||||||
text = nlp("Talk about being boring!")
|
text = nlp("Talk about being boring!")
|
||||||
text_var = nlp("Talk of being boring!")
|
text_var = nlp("Talk of being boring!")
|
||||||
y = nlp("Let")
|
y = nlp("Let")
|
||||||
|
|
||||||
span = text[0:3] # Talk about being
|
span = text[0:3] # Talk about being
|
||||||
span_2 = text[0:3] # Talk about being
|
span_2 = text[0:3] # Talk about being
|
||||||
span_3 = text_var[0:3] # Talk of being
|
span_3 = text_var[0:3] # Talk of being
|
||||||
|
|
|
@ -63,7 +63,8 @@ def tagger():
|
||||||
# need to add model for two reasons:
|
# need to add model for two reasons:
|
||||||
# 1. no model leads to error in serialization,
|
# 1. no model leads to error in serialization,
|
||||||
# 2. the affected line is the one for model serialization
|
# 2. the affected line is the one for model serialization
|
||||||
tagger.begin_training(pipeline=nlp.pipeline)
|
with pytest.warns(UserWarning):
|
||||||
|
tagger.begin_training(pipeline=nlp.pipeline)
|
||||||
return tagger
|
return tagger
|
||||||
|
|
||||||
|
|
||||||
|
|
31
spacy/tests/regression/test_issue5551.py
Normal file
31
spacy/tests/regression/test_issue5551.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.util import fix_random_seed
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue5551():
|
||||||
|
"""Test that after fixing the random seed, the results of the pipeline are truly identical"""
|
||||||
|
component = "textcat"
|
||||||
|
pipe_cfg = {"exclusive_classes": False}
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for i in range(3):
|
||||||
|
fix_random_seed(0)
|
||||||
|
nlp = English()
|
||||||
|
example = (
|
||||||
|
"Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
|
||||||
|
{"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
|
||||||
|
)
|
||||||
|
nlp.add_pipe(nlp.create_pipe(component, config=pipe_cfg), last=True)
|
||||||
|
pipe = nlp.get_pipe(component)
|
||||||
|
for label in set(example[1]["cats"]):
|
||||||
|
pipe.add_label(label)
|
||||||
|
nlp.begin_training(component_cfg={component: pipe_cfg})
|
||||||
|
|
||||||
|
# Store the result of each iteration
|
||||||
|
result = pipe.model.predict([nlp.make_doc(example[0])])
|
||||||
|
results.append(list(result[0]))
|
||||||
|
|
||||||
|
# All results should be the same because of the fixed seed
|
||||||
|
assert len(results) == 3
|
||||||
|
assert results[0] == results[1]
|
||||||
|
assert results[0] == results[2]
|
@@ -1,3 +1,4 @@
+import numpy
 from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo

@@ -5,6 +6,7 @@ from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.gold.converters import json2docs
 from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding

@@ -153,6 +155,27 @@ def test_gold_biluo_misalign(en_vocab):
     assert tags == ["O", "O", "O", "-", "-", "-"]
+
+
+def test_example_constructor(en_vocab):
+    words = ["I", "like", "stuff"]
+    tags = ["NOUN", "VERB", "NOUN"]
+    tag_ids = [en_vocab.strings.add(tag) for tag in tags]
+    predicted = Doc(en_vocab, words=words)
+    reference = Doc(en_vocab, words=words)
+    reference = reference.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
+    example = Example(predicted, reference)
+    tags = example.get_aligned("TAG", as_string=True)
+    assert tags == ["NOUN", "VERB", "NOUN"]
+
+
+def test_example_from_dict_tags(en_vocab):
+    words = ["I", "like", "stuff"]
+    tags = ["NOUN", "VERB", "NOUN"]
+    predicted = Doc(en_vocab, words=words)
+    example = Example.from_dict(predicted, {"TAGS": tags})
+    tags = example.get_aligned("TAG", as_string=True)
+    assert tags == ["NOUN", "VERB", "NOUN"]
+
+
 def test_example_from_dict_no_ner(en_vocab):
     words = ["a", "b", "c", "d"]
     spaces = [True, True, False, True]
@@ -272,72 +295,72 @@ def test_split_sentences(en_vocab):


 def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
-    words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "U-LOC", "O"]

     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]

     entities = [
-        (len("Mr. and "), len("Mr. and Mrs."), "PERSON"),  # "Mrs." is a Person
+        (len("Mr and "), len("Mr and Mrs"), "PERSON"),  # "Mrs" is a Person
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", None, "O", "U-LOC", "O"]


 def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
-    words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]

     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]


 def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
-    words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."]
+    words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
+    gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]

     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
+    gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]

@@ -407,6 +430,49 @@ def test_biluo_spans(en_tokenizer):
     assert spans[1].label_ == "GPE"
+
+
+def test_aligned_spans_y2x(en_vocab, en_tokenizer):
+    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
+    spaces = [True, True, True, False, False]
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    prefix = "Mr and Mrs Smith flew to "
+    entities = [
+        (0, len("Mr and Mrs Smith"), "PERSON"),
+        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
+    ]
+    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
+    ents_ref = example.reference.ents
+    assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
+    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
+    assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
+
+
+def test_aligned_spans_x2y(en_vocab, en_tokenizer):
+    text = "Mr and Mrs Smith flew to San Francisco Valley"
+    nlp = English()
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
+                {"label": "LOC", "pattern": "San Francisco Valley"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+    doc = nlp(text)
+    assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
+    prefix = "Mr and Mrs Smith flew to "
+    entities = [
+        (0, len("Mr and Mrs Smith"), "PERSON"),
+        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
+    ]
+    tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
+    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
+    assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
+
+    # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
+    ents_pred = example.predicted.ents
+    assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
+    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
+    assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
+
+
 def test_gold_ner_missing_tags(en_tokenizer):
     doc = en_tokenizer("I flew to Silicon Valley via London.")
     biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]

@@ -414,6 +480,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
     assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
+
+
+def test_projectivize(en_tokenizer):
+    doc = en_tokenizer("He pretty quickly walks away")
+    heads = [3, 2, 3, 0, 2]
+    example = Example.from_dict(doc, {"heads": heads})
+    proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
+    nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
+    assert proj_heads == [3, 2, 3, 0, 3]
+    assert nonproj_heads == [3, 2, 3, 0, 2]
+
+
 def test_iob_to_biluo():
     good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
     good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]


spacy/tests/test_models.py (new file, 156 lines)
@@ -0,0 +1,156 @@
from typing import List

import pytest
from thinc.api import fix_random_seed, Adam, set_dropout_rate
from numpy.testing import assert_array_equal
import numpy

from spacy.ml.models import build_Tok2Vec_model
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
from spacy.lang.en import English
from spacy.lang.en.examples import sentences as EN_SENTENCES


def get_all_params(model):
    params = []
    for node in model.walk():
        for name in node.param_names:
            params.append(node.get_param(name).ravel())
    return node.ops.xp.concatenate(params)


def get_docs():
    nlp = English()
    return list(nlp.pipe(EN_SENTENCES + [" ".join(EN_SENTENCES)]))


def get_gradient(model, Y):
    if isinstance(Y, model.ops.xp.ndarray):
        dY = model.ops.alloc(Y.shape, dtype=Y.dtype)
        dY += model.ops.xp.random.uniform(-1.0, 1.0, Y.shape)
        return dY
    elif isinstance(Y, List):
        return [get_gradient(model, y) for y in Y]
    else:
        raise ValueError(f"Could not get gradient for type {type(Y)}")


def default_tok2vec():
    return build_Tok2Vec_model(**TOK2VEC_KWARGS)


TOK2VEC_KWARGS = {
    "width": 96,
    "embed_size": 2000,
    "subword_features": True,
    "char_embed": False,
    "conv_depth": 4,
    "bilstm_depth": 0,
    "maxout_pieces": 4,
    "window_size": 1,
    "dropout": 0.1,
    "nM": 0,
    "nC": 0,
    "pretrained_vectors": None,
}

TEXTCAT_KWARGS = {
    "width": 64,
    "embed_size": 2000,
    "pretrained_vectors": None,
    "exclusive_classes": False,
    "ngram_size": 1,
    "window_size": 1,
    "conv_depth": 2,
    "dropout": None,
    "nO": 7
}

TEXTCAT_CNN_KWARGS = {
    "tok2vec": default_tok2vec(),
    "exclusive_classes": False,
    "nO": 13,
}


@pytest.mark.parametrize(
    "seed,model_func,kwargs",
    [
        (0, build_Tok2Vec_model, TOK2VEC_KWARGS),
        (0, build_text_classifier, TEXTCAT_KWARGS),
        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS),
    ],
)
def test_models_initialize_consistently(seed, model_func, kwargs):
    fix_random_seed(seed)
    model1 = model_func(**kwargs)
    model1.initialize()
    fix_random_seed(seed)
    model2 = model_func(**kwargs)
    model2.initialize()
    params1 = get_all_params(model1)
    params2 = get_all_params(model2)
    assert_array_equal(params1, params2)


@pytest.mark.parametrize(
    "seed,model_func,kwargs,get_X",
    [
        (0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
        (0, build_text_classifier, TEXTCAT_KWARGS, get_docs),
        (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
    ],
)
def test_models_predict_consistently(seed, model_func, kwargs, get_X):
    fix_random_seed(seed)
    model1 = model_func(**kwargs).initialize()
    Y1 = model1.predict(get_X())
    fix_random_seed(seed)
    model2 = model_func(**kwargs).initialize()
    Y2 = model2.predict(get_X())

    if model1.has_ref("tok2vec"):
        tok2vec1 = model1.get_ref("tok2vec").predict(get_X())
        tok2vec2 = model2.get_ref("tok2vec").predict(get_X())
        for i in range(len(tok2vec1)):
            for j in range(len(tok2vec1[i])):
                assert_array_equal(numpy.asarray(tok2vec1[i][j]), numpy.asarray(tok2vec2[i][j]))

    if isinstance(Y1, numpy.ndarray):
        assert_array_equal(Y1, Y2)
    elif isinstance(Y1, List):
        assert len(Y1) == len(Y2)
        for y1, y2 in zip(Y1, Y2):
            assert_array_equal(y1, y2)
    else:
        raise ValueError(f"Could not compare type {type(Y1)}")


@pytest.mark.parametrize(
    "seed,dropout,model_func,kwargs,get_X",
    [
        (0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
        (0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs),
        (0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
    ],
)
def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
    def get_updated_model():
        fix_random_seed(seed)
        optimizer = Adam(0.001)
        model = model_func(**kwargs).initialize()
        initial_params = get_all_params(model)
        set_dropout_rate(model, dropout)
        for _ in range(5):
            Y, get_dX = model.begin_update(get_X())
            dY = get_gradient(model, Y)
            _ = get_dX(dY)
            model.finish_update(optimizer)
        updated_params = get_all_params(model)
        with pytest.raises(AssertionError):
            assert_array_equal(initial_params, updated_params)
        return model

    model1 = get_updated_model()
    model2 = get_updated_model()
    assert_array_equal(get_all_params(model1), get_all_params(model2))
spacy/tests/test_projects.py (new file, 31 lines)
@@ -0,0 +1,31 @@
import pytest
from spacy.cli.project.util import validate_project_commands
from spacy.schemas import ProjectConfigSchema, validate


@pytest.mark.parametrize(
    "config",
    [
        {"commands": [{"name": "a"}, {"name": "a"}]},
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    with pytest.raises(SystemExit):
        validate_project_commands(config)


@pytest.mark.parametrize(
    "config,n_errors",
    [
        ({"commands": {"a": []}}, 1),
        ({"commands": [{"help": "..."}]}, 1),
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        ({"commands": [{"extra": "b"}]}, 2),
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    errors = validate(ProjectConfigSchema, config)
    assert len(errors) == n_errors
@@ -803,7 +803,7 @@ cdef class Doc:
         attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                  for id_ in attrs]
         if array.dtype != numpy.uint64:
-            warnings.warn(Warnings.W101.format(type=array.dtype))
+            warnings.warn(Warnings.W028.format(type=array.dtype))

         if SENT_START in attrs and HEAD in attrs:
             raise ValueError(Errors.E032)
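The warning in the hunk above (now `W028`) fires when the attribute array handed to a `Doc` (e.g. via `Doc.from_array`) does not have dtype `uint64`. A minimal sketch of the expected call, modelled on the `test_example_constructor` test added in this commit; the tag values are purely illustrative:

```python
import numpy
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("I like stuff")
# Attribute values are uint64 hashes, e.g. IDs returned by the StringStore
tag_ids = [nlp.vocab.strings.add(tag) for tag in ["NOUN", "VERB", "NOUN"]]
# Any other dtype would trigger the warning patched above
doc = doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
```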
@@ -20,7 +20,6 @@ import subprocess
 from contextlib import contextmanager
 import tempfile
 import shutil
-import hashlib
 import shlex

 try:

@@ -449,6 +448,16 @@ def split_command(command: str) -> List[str]:
     return shlex.split(command, posix=not is_windows)


+def join_command(command: List[str]) -> str:
+    """Join a command using shlex. shlex.join is only available for Python 3.8+,
+    so we're using a workaround here.
+
+    command (List[str]): The command to join.
+    RETURNS (str): The joined command
+    """
+    return " ".join(shlex.quote(cmd) for cmd in command)
+
+
 def run_command(command: Union[str, List[str]]) -> None:
     """Run a command on the command line as a subprocess. If the subprocess
     returns a non-zero exit code, a system exit is performed.

@@ -501,23 +510,13 @@ def make_tempdir():
         warnings.warn(Warnings.W091.format(dir=d, msg=e))


-def get_hash(data) -> str:
-    """Get the hash for a JSON-serializable object.
-
-    data: The data to hash.
-    RETURNS (str): The hash.
-    """
-    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
-    return hashlib.md5(data_str).hexdigest()
-
-
-def get_checksum(path: Union[Path, str]) -> str:
-    """Get the checksum for a file given its file path.
-
-    path (Union[Path, str]): The file path.
-    RETURNS (str): The checksum.
-    """
-    return hashlib.md5(Path(path).read_bytes()).hexdigest()
+def is_cwd(path: Union[Path, str]) -> bool:
+    """Check whether a path is the current working directory.
+
+    path (Union[Path, str]): The directory path.
+    RETURNS (bool): Whether the path is the current working directory.
+    """
+    return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()


 def is_in_jupyter():
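For a quick sense of the helpers introduced above: `join_command` simply shell-quotes and joins, so it round-trips with `split_command`, and `is_cwd` compares resolved paths case-insensitively. A small sketch, assuming both are imported from `spacy.util` as defined in the hunks above:

```python
from spacy.util import join_command, split_command, is_cwd

cmd = ["python", "-m", "spacy", "project", "run", "train"]
joined = join_command(cmd)           # "python -m spacy project run train"
assert split_command(joined) == cmd  # round-trips back into argv-style tokens
assert is_cwd(".")                   # "." resolves to the current working directory
```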
@@ -722,6 +721,51 @@ def minibatch(items, size=8):
     yield list(batch)


+def minibatch_by_padded_size(docs, size, buffer=256, discard_oversize=False):
+    if isinstance(size, int):
+        size_ = itertools.repeat(size)
+    else:
+        size_ = size
+    for outer_batch in minibatch(docs, buffer):
+        outer_batch = list(outer_batch)
+        target_size = next(size_)
+        for indices in _batch_by_length(outer_batch, target_size):
+            subbatch = [outer_batch[i] for i in indices]
+            padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
+            if discard_oversize and padded_size >= target_size:
+                pass
+            else:
+                yield subbatch
+
+
+def _batch_by_length(seqs, max_words):
+    """Given a list of sequences, return a batched list of indices into the
+    list, where the batches are grouped by length, in descending order.
+
+    Batches may be at most max_words in size, defined as max sequence length * size.
+    """
+    # Use negative index so we can get sort by position ascending.
+    lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)]
+    lengths_indices.sort()
+    batches = []
+    batch = []
+    for length, i in lengths_indices:
+        if not batch:
+            batch.append(i)
+        elif length * (len(batch) + 1) <= max_words:
+            batch.append(i)
+        else:
+            batches.append(batch)
+            batch = [i]
+    if batch:
+        batches.append(batch)
+    # Check lengths match
+    assert sum(len(b) for b in batches) == len(seqs)
+    batches = [list(sorted(batch)) for batch in batches]
+    batches.reverse()
+    return batches
+
+
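To see how the new padded-size batching behaves, here is a small sketch. `minibatch_by_padded_size` only looks at `len()` of its items, so plain lists stand in for docs; the sizes are illustrative:

```python
from spacy.util import minibatch_by_padded_size

# "Docs" of lengths 1, 2, 3, 8 and 8
docs = [[0], [0, 1], [0, 1, 2], list(range(8)), list(range(8))]
for subbatch in minibatch_by_padded_size(docs, size=8):
    # padded size = longest sequence in the sub-batch * number of sequences
    padded = max(len(seq) for seq in subbatch) * len(subbatch)
    # with size=8 and no sequence longer than 8, every sub-batch stays within the target;
    # a single oversized doc could still exceed it unless discard_oversize=True
    assert padded <= 8
```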
 def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
     """Create minibatches of roughly a given number of words. If any examples
     are longer than the specified batch length, they will appear in a batch by

@@ -768,7 +812,8 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):

         # yield the previous batch and start a new one. The new one gets the overflow examples.
         else:
-            yield batch
+            if batch:
+                yield batch
             target_size = next(size_)
             tol_size = target_size * tolerance
             batch = overflow

@@ -788,15 +833,15 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):

         # this example does not fit with the previous overflow: start another new batch
         else:
-            yield batch
+            if batch:
+                yield batch
             target_size = next(size_)
             tol_size = target_size * tolerance
             batch = [doc]
             batch_size = n_words

-    # yield the final batch
+    batch.extend(overflow)
     if batch:
-        batch.extend(overflow)
         yield batch


@@ -4,4 +4,34 @@ teaser: Pre-defined model architectures included with the core library
 source: spacy/ml/models
 ---

-TODO: write
+TODO: intro and how architectures work, link to
+[`registry`](/api/top-level#registry),
+[custom models](/usage/training#custom-models) usage etc.
+
+## Parser architectures {source="spacy/ml/models/parser.py"}
+
+### spacy.TransitionBasedParser.v1
+
+<!-- TODO: intro -->
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TransitionBasedParser.v1"
+> nr_feature_tokens = 6
+> hidden_width = 64
+> maxout_pieces = 2
+>
+> [model.tok2vec]
+> # ...
+> ```
+
+| Name                | Type                                       | Description |
+| ------------------- | ------------------------------------------ | ----------- |
+| `tok2vec`           | [`Model`](https://thinc.ai/docs/api-model) |             |
+| `nr_feature_tokens` | int                                        |             |
+| `hidden_width`      | int                                        |             |
+| `maxout_pieces`     | int                                        |             |
+| `use_upper`         | bool                                       |             |
+| `nO`                | int                                        |             |
@@ -297,60 +297,41 @@ will not be available.

 ## Train {#train}

-<!-- TODO: document new training -->
-
 Train a model. Expects data in spaCy's
-[JSON format](/api/data-formats#json-input). On each epoch, a model will be
-saved out to the directory. Accuracy scores and model details will be added to a
-[`meta.json`](/usage/training#models-generating) to allow packaging the model
-using the [`package`](/api/cli#package) command.
+[binary format](/api/data-formats#training) and a
+[config file](/api/data-formats#config) with all settings and hyperparameters.
+Will save out the best model from all epochs, as well as the final model. The
+`--code` argument can be used to provide a Python file that's imported before
+the training process starts. This lets you register
+[custom functions](/usage/training#custom-models) and architectures and refer to
+them in your config, all while still using spaCy's built-in `train` workflow. If
+you need to manage complex multi-step training workflows, check out the new
+[spaCy projects](/usage/projects).
+
+<Infobox title="New in v3.0" variant="warning">
+
+As of spaCy v3.0, the `train` command doesn't take a long list of command-line
+arguments anymore and instead expects a single
+[`config.cfg` file](/usage/training#config) containing all settings for the
+pipeline, training process and hyperparameters.
+
+</Infobox>

 ```bash
-$ python -m spacy train [lang] [output_path] [train_path] [dev_path]
-[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping]
-[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec]
-[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level]
-[--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel]
-[--textcat-positive-label] [--verbose]
+$ python -m spacy train [train_path] [dev_path] [config_path] [--output]
+[--code] [--verbose]
 ```

 | Argument | Type | Description |
-| --- | --- | --- |
-| `lang` | positional | Model language. |
-| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. |
-| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
-| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
-| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. |
-| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
-| `--replace-components`, `-R` | flag | Replace components from the base model. |
-| `--vectors`, `-v` | option | Model to load vectors from. |
-| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
-| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
-| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). |
-| `--use-gpu`, `-g` | option | GPU ID or `-1` for CPU only (default: `-1`). |
-| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. |
-| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. |
-| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
-| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
-| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
-| `--width`, `-cw` <Tag variant="new">2.2.4</Tag> | option | Width of CNN layers of `Tok2Vec` component. |
-| `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag> | option | Depth of CNN layers of `Tok2Vec` component. |
-| `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag> | option | Window size for CNN layers of `Tok2Vec` component. |
-| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag> | option | Maxout size for CNN layers of `Tok2Vec` component. |
-| `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag> | flag | Whether to use character-based embedding of `Tok2Vec` component. |
-| `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag> | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). |
-| `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag> | option | Number of embedding rows of `Tok2Vec` component. |
-| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
-| `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag> | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). |
-| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
-| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. |
-| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). |
-| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. |
-| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. |
-| `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option | Location of JSON-formatted tag map. |
-| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. |
-| `--help`, `-h` | flag | Show help message and available arguments. |
-| **CREATES** | model, pickle | A spaCy model on each epoch. |
+| --- | --- | --- |
+| `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
+| `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
+| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
+| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
+| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
+| `--verbose`, `-V` | flag | Show more detailed messages during training. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | model | The final model and the best model. |
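For illustration, a concrete call following the new signature could look like the sketch below; the paths and file extensions are placeholders, and `--code` is only needed if you register custom functions:

```bash
$ python -m spacy train ./corpus/train.spacy ./corpus/dev.spacy ./configs/config.cfg \
    --output ./training --code ./custom_functions.py
```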
 ## Pretrain {#pretrain new="2.1" tag="experimental"}

@@ -471,20 +452,20 @@ as separate files if the respective component is present in the model's
 pipeline.

 ```bash
-$ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit]
-[--gpu-id] [--gold-preproc] [--return-scores]
+$ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path]
+[--displacy-limit] [--gpu-id] [--gold-preproc]
 ```

 | Argument | Type | Description |
 | --- | --- | --- |
 | `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
-| `data_path` | positional | Location of JSON-formatted evaluation data. |
+| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). |
+| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. |
 | `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
 | `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
 | `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
 | `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
-| `--return-scores`, `-R` | flag | Return dict containing model scores. |
-| **CREATES** | `stdout`, HTML | Training results and optional displaCy visualizations. |
+| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. |

 ## Package {#package}

@@ -504,15 +485,17 @@ so you don't have to run `python setup.py sdist` separately anymore.
 </Infobox>

 ```bash
-$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
+$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
+[--version] [--force]
 ```

-```bash
-### Example
-python -m spacy package /input /output
-cd /output/en_model-0.0.0
-pip install dist/en_model-0.0.0.tar.gz
-```
+> #### Example
+>
+> ```bash
+> python -m spacy package /input /output
+> cd /output/en_model-0.0.0
+> pip install dist/en_model-0.0.0.tar.gz
+> ```

 | Argument | Type | Description |
 | --- | --- | --- |

@@ -525,18 +508,137 @@ pip install dist/en_model-0.0.0.tar.gz
 | `--help`, `-h` | flag | Show help message and available arguments. |
 | **CREATES** | directory | A Python package containing the spaCy model. |

-## Project {#project}
+## Project {#project new="3"}

-<!-- TODO: document project command and subcommands. We should probably wait and only finalize this once we've finalized the design -->
+The `spacy project` CLI includes subcommands for working with
+[spaCy projects](/usage/projects), end-to-end workflows for building and
+deploying custom spaCy models.

 ### project clone {#project-clone}

+Clone a project template from a Git repository. Calls into `git` under the hood
+and uses the sparse checkout feature, so you're only downloading what you need.
+By default, spaCy's
+[project templates repo](https://github.com/explosion/projects) is used, but you
+can provide any other repo (public or private) that you have access to using the
+`--repo` option.
+
+<!-- TODO: update example once we've decided on repo structure -->
+
+```bash
+$ python -m spacy project clone [name] [dest] [--repo]
+```
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy project clone some_example
+> ```
+>
+> Clone from custom repo:
+>
+> ```bash
+> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
+> ```
+
+| Argument | Type | Description |
+| --- | --- | --- |
+| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. |
+| `dest` | positional | Where to clone the project. Defaults to current working directory. |
+| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). |

 ### project assets {#project-assets}

-### project run-all {#project-run-all}
+Fetch project assets like datasets and pretrained weights. Assets are defined in
+the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a
+`checksum` is provided, the file is only downloaded if no local file with the
+same checksum exists and spaCy will show an error if the checksum of the
+downloaded file doesn't match. If assets don't specify a `url` they're
+considered "private" and you have to take care of putting them into the
+destination directory yourself. If a local path is provided, the asset is copied
+into the current project.
+
+```bash
+$ python -m spacy project assets [project_dir]
+```
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy project assets
+> ```
+
+| Argument | Type | Description |
+| --- | --- | --- |
+| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. |

 ### project run {#project-run}

-### project init {#project-init}
+Run a named command or workflow defined in the
+[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
+all commands in the workflow are run, in order. If commands define
+[dependencies or outputs](/usage/projects#deps-outputs), they will only be
+re-run if state has changed. For example, if the input dataset changes, a
+preprocessing command that depends on those files will be re-run.

-### project update-dvc {#project-update-dvc}
+```bash
+$ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
+```
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy project run train
+> ```
+
+| Argument | Type | Description |
+| --- | --- | --- |
+| `subcommand` | positional | Name of the command or workflow to run. |
+| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
+| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
+| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+
+### project dvc {#project-dvc}
+
+Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
+[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
+the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline,
+so you need to specify one workflow defined in the
+[`project.yml`](/usage/projects#project-yml). If no workflow is specified, the
+first defined workflow is used. The DVC config will only be updated if the
+`project.yml` changed. For details, see the
+[DVC integration](/usage/projects#dvc) docs.
+
+<Infobox variant="warning">
+
+This command requires DVC to be installed and initialized in the project
+directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init).
+You'll also need to add the assets you want to track with
+[`dvc add`](https://dvc.org/doc/command-reference/add).
+
+</Infobox>
+
+```bash
+$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
+```
+
+> #### Example
+>
+> ```bash
+> git init
+> dvc init
+> python -m spacy project dvc all
+> ```
+
+| Argument | Type | Description |
+| --- | --- | --- |
+| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
+| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
+| `--force`, `-F` | flag | Force-updating config file. |
+| `--verbose`, `-V` | flag | Print more output generated by DVC. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
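To tie the subcommands above together, here is a small hypothetical `project.yml` sketch. Only `assets` (with `dest`, `url`, `checksum`), `commands` (with `name`, `deps`, `outputs`) and `workflows` are taken from the descriptions and schema tests in this commit; the `script` key and all concrete values are illustrative assumptions, not the canonical template format:

```yaml
assets:
  - dest: "assets/train.json"
    url: "https://example.com/train.json"  # omit the URL for "private" assets you copy in yourself
    checksum: "63373dd656daa1fd3043ce166a59474c"  # placeholder MD5; download is skipped if it already matches

commands:
  - name: "preprocess"
    script:  # assumed key: the shell command(s) the step runs
      - "python scripts/preprocess.py assets/train.json corpus/train.spacy"
    deps:
      - "assets/train.json"
    outputs:
      - "corpus/train.spacy"
  - name: "train"
    script:
      - "python -m spacy train corpus/train.spacy corpus/dev.spacy configs/config.cfg --output training"
    deps:
      - "corpus/train.spacy"

workflows:
  all:  # `spacy project run all` (or `spacy project dvc all`) runs these in order
    - "preprocess"
    - "train"
```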
@@ -122,7 +122,7 @@ where the rescuers keep passing out from low oxygen, causing another rescuer to
 follow — only to succumb themselves. In short, just say no to optimizing your
 Python. If it's not fast enough the first time, just switch to Cython.

-<Infobox title="📖 Resources">
+<Infobox title="Resources" emoji="📖">

 - [Official Cython documentation](http://docs.cython.org/en/latest/)
   (cython.org)
@@ -2,7 +2,8 @@
 title: Data formats
 teaser: Details on spaCy's input and output data formats
 menu:
-  - ['Training data', 'training']
+  - ['Training Data', 'training']
+  - ['Training Config', 'config']
   - ['Vocabulary', 'vocab']
 ---

@@ -74,6 +75,29 @@ from the English Wall Street Journal portion of the Penn Treebank:
 https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
 ```

+## Training config {#config new="3"}
+
+Config files define the training process and model pipeline and can be passed to
+[`spacy train`](/api/cli#train). They use
+[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
+hood. For details on how to use training configs, see the
+[usage documentation](/usage/training#config).
+
+<Infobox variant="warning">
+
+The `@` syntax lets you refer to function names registered in the
+[function registry](/api/top-level#registry). For example,
+`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
+the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block
+will be passed into that function as arguments. Those arguments depend on the
+registered function. See the [model architectures](/api/architectures) docs for
+API details.
+
+</Infobox>
+
+<!-- TODO: we need to come up with a good way to present the sections and their expected values visually? -->
+<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
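As a concrete sketch of the `@` syntax, the block below wires the registered parser architecture documented elsewhere in this commit into a config block; the values are the example defaults from the architectures page, not a complete training config:

```ini
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2

[model.tok2vec]
# nested block: whatever registered function is referenced here via @architectures
# receives the remaining values of this block as its arguments
```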
 ## Lexical data for vocabulary {#vocab-jsonl new="2"}

 To populate a model's vocabulary, you can use the
@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
component is available in the [processing pipeline](/usage/processing-pipelines)
|
||||||
via the ID `"parser"`.
|
via the ID `"parser"`.
|
||||||
|
|
||||||
## DependencyParser.Model {#model tag="classmethod"}
|
## Default config {#config}
|
||||||
|
|
||||||
Initialize a model for the pipe. The model should implement the
|
This is the default configuration used to initialize the model powering the
|
||||||
`thinc.neural.Model` API. Wrappers are under development for most major machine
|
pipeline component. See the [model architectures](/api/architectures)
|
||||||
learning libraries.
|
documentation for details on the architectures and their arguments and
|
||||||
|
hyperparameters. To learn more about how to customize the config and train
|
||||||
|
custom models, check out the [training config](/usage/training#config) docs.
|
||||||
|
|
||||||
| Name | Type | Description |
|
```python
|
||||||
| ----------- | ------ | ------------------------------------- |
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_defaults.cfg
|
||||||
| `**kwargs` | - | Parameters for initializing the model |
|
```
|
||||||
| **RETURNS** | object | The initialized model. |
|
|
||||||
|
|
||||||
## DependencyParser.\_\_init\_\_ {#init tag="method"}
|
## DependencyParser.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> # Construction via create_pipe with default model
|
||||||
|
> parser = nlp.create_pipe("parser")
|
||||||
|
>
|
||||||
|
> # Construction via create_pipe with custom model
|
||||||
|
> config = {"model": {"@architectures": "my_parser"}}
|
||||||
|
> parser = nlp.create_pipe("parser", config)
|
||||||
|
>
|
||||||
|
> # Construction from class with custom model from file
|
||||||
|
> from spacy.pipeline import DependencyParser
|
||||||
|
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
||||||
|
> parser = DependencyParser(nlp.vocab, model)
|
||||||
|
> ```
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.create_pipe`](/api/language#create_pipe).
|
||||||
|
|
||||||
> #### Example
|
| Name | Type | Description |
|
||||||
>
|
| ----------- | ------------------ | ------------------------------------------------------------------------------- |
|
||||||
> ```python
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
> # Construction via create_pipe
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
> parser = nlp.create_pipe("parser")
|
| `**cfg` | - | Configuration parameters. |
|
||||||
>
|
| **RETURNS** | `DependencyParser` | The newly constructed object. |
|
||||||
> # Construction from class
|
|
||||||
> from spacy.pipeline import DependencyParser
|
|
||||||
> parser = DependencyParser(nlp.vocab)
|
|
||||||
> parser.from_disk("/path/to/model")
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
|
||||||
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
|
|
||||||
| `**cfg` | - | Configuration parameters. |
|
|
||||||
| **RETURNS** | `DependencyParser` | The newly constructed object. |
|
|
||||||
|
|
||||||
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ------------------------------------------------------ |
|
| ------------ | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | iterable | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
## DependencyParser.predict {#predict tag="method"}

@@ -104,7 +109,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.

| Name        | Type                | Description                                     |
| ----------- | ------------------- | ------------------------------------------------ |
| `docs`      | `Iterable[Doc]`     | The documents to predict.                         |
| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal).    |

## DependencyParser.set_annotations {#set_annotations tag="method"}

@@ -119,33 +124,34 @@ Modify a batch of documents, using pre-computed scores.

> parser.set_annotations([doc1, doc2], scores)
> ```

| Name     | Type                | Description                                                 |
| -------- | ------------------- | ------------------------------------------------------------ |
| `docs`   | `Iterable[Doc]`     | The documents to modify.                                      |
| `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`.    |
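
Together, `predict` and `set_annotations` are the two-step equivalent of calling
the parser on a document. A minimal sketch of that round trip, assuming a
trained parser and two pre-tokenized `Doc` objects:

```python
# Sketch: the two-step equivalent of calling the parser on a document.
docs = [doc1, doc2]                   # assumed pre-tokenized Doc objects
scores = parser.predict(docs)         # internal StateClass helper
parser.set_annotations(docs, scores)  # writes heads and dependency labels
print([token.dep_ for token in docs[0]])
```
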
## DependencyParser.update {#update tag="method"}

Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/dependencyparser#predict) and
[`get_loss`](/api/dependencyparser#get_loss).

> #### Example
>
> ```python
> parser = DependencyParser(nlp.vocab, parser_model)
> optimizer = nlp.begin_training()
> losses = parser.update(examples, sgd=optimizer)
> ```

| Name              | Type                | Description                                                                                                                                     |
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                        |
| _keyword-only_    |                     |                                                                                                                                                    |
| `drop`            | float               | The dropout rate.                                                                                                                                  |
| `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations).     |
| `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                                    |
| `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                      |
| **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                                   |
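
Put together, `begin_training` and `update` form a bare-bones training loop. The
sketch below is illustrative only: it assumes `train_examples` is a list of
`Example` objects with gold dependency annotations prepared elsewhere, and the
batch size, dropout and epoch count are arbitrary:

```python
# Sketch of a minimal training loop; `train_examples` is an assumed,
# pre-built list of Example objects with gold dependency annotations.
import random
from spacy.util import minibatch

optimizer = nlp.begin_training()
for epoch in range(10):
    random.shuffle(train_examples)
    losses = {}
    for batch in minibatch(train_examples, size=8):
        losses = parser.update(batch, drop=0.1, sgd=optimizer, losses=losses)
    print(epoch, losses.get("parser"))  # loss is keyed by the model's name
```
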
## DependencyParser.get_loss {#get_loss tag="method"}

@@ -156,21 +162,20 @@ predicted scores.

>
> ```python
> parser = DependencyParser(nlp.vocab)
> scores = parser.predict([eg.predicted for eg in examples])
> loss, d_loss = parser.get_loss(examples, scores)
> ```

| Name        | Type                | Description                                           |
| ----------- | ------------------- | ------------------------------------------------------ |
| `examples`  | `Iterable[Example]` | The batch of examples.                                  |
| `scores`    | `syntax.StateClass` | Scores representing the model's predictions.            |
| **RETURNS** | tuple               | The loss and the gradient, i.e. `(loss, gradient)`.     |
## DependencyParser.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example
>

@@ -180,16 +185,17 @@ has been initialized yet, the model is added.

> optimizer = parser.begin_training(pipeline=nlp.pipeline)
> ```

| Name           | Type                    | Description                                                                                                                                                          |
| -------------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                                   |
| `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                                   |
| `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set.   |
| **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                          |

## DependencyParser.create_optimizer {#create_optimizer tag="method"}

Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
component.

> #### Example
>

@@ -198,9 +204,9 @@ Create an optimizer for the pipeline component.

> optimizer = parser.create_optimizer()
> ```

| Name        | Type        | Description                                                      |
| ----------- | ----------- | ----------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.    |

## DependencyParser.use_params {#use_params tag="method, contextmanager"}
@@ -12,44 +12,47 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"entity_linker"`.

## Default config {#config}

This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.

```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_linker_defaults.cfg
```
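
The default config linked above can be resolved with the same `util.load_config`
helper used in the constructor examples. A small sketch, assuming
`entity_linker_defaults.cfg` is a local copy of that file:

```python
# Sketch: building the default entity linker model from a local copy of
# the config file linked above ("entity_linker_defaults.cfg" is assumed).
from spacy import util
from spacy.pipeline import EntityLinker

config = util.load_config("entity_linker_defaults.cfg", create_objects=True)
model = config["model"]
entity_linker = EntityLinker(nlp.vocab, model)
```
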
## EntityLinker.\_\_init\_\_ {#init tag="method"}

> #### Example
>
> ```python
> # Construction via create_pipe with default model
> entity_linker = nlp.create_pipe("entity_linker")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_el"}}
> entity_linker = nlp.create_pipe("entity_linker", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import EntityLinker
> model = util.load_config("model.cfg", create_objects=True)["model"]
> entity_linker = EntityLinker(nlp.vocab, model)
> ```

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).

| Name        | Type           | Description                                                                      |
| ----------- | -------------- | --------------------------------------------------------------------------------- |
| `vocab`     | `Vocab`        | The shared vocabulary.                                                             |
| `model`     | `Model`        | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.   |
| `**cfg`     | -              | Configuration parameters.                                                          |
| **RETURNS** | `EntityLinker` | The newly constructed object.                                                      |
## EntityLinker.\_\_call\_\_ {#call tag="method"}

@@ -91,11 +94,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and

> pass
> ```

| Name         | Type            | Description                                             |
| ------------ | --------------- | -------------------------------------------------------- |
| `stream`     | `Iterable[Doc]` | A stream of documents.                                    |
| `batch_size` | int             | The number of texts to buffer. Defaults to `128`.         |
| **YIELDS**   | `Doc`           | Processed documents in the order of the original text.    |
## EntityLinker.predict {#predict tag="method"}

@@ -105,13 +108,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.

>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids = entity_linker.predict([doc1, doc2])
> ```

| Name        | Type            | Description                                                   |
| ----------- | --------------- | --------------------------------------------------------------- |
| `docs`      | `Iterable[Doc]` | The documents to predict.                                        |
| **RETURNS** | `Iterable[str]` | The predicted KB identifiers for the entities in the `docs`.     |

## EntityLinker.set_annotations {#set_annotations tag="method"}

@@ -122,19 +125,18 @@ entities.

>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids = entity_linker.predict([doc1, doc2])
> entity_linker.set_annotations([doc1, doc2], kb_ids)
> ```

| Name     | Type            | Description                                                                                          |
| -------- | --------------- | ------------------------------------------------------------------------------------------------------ |
| `docs`   | `Iterable[Doc]` | The documents to modify.                                                                                 |
| `kb_ids` | `Iterable[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`.        |
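
Once `set_annotations` has written the predicted identifiers back, they are
exposed on the entity spans. A small sketch, assuming `doc1` and `doc2` already
have `doc.ents` set by an earlier NER component and the linker is trained:

```python
# Sketch: predicted KB identifiers end up on the entity spans.
# Assumes doc1 and doc2 already have doc.ents set by an NER component.
docs = [doc1, doc2]
kb_ids = entity_linker.predict(docs)
entity_linker.set_annotations(docs, kb_ids)
for ent in docs[0].ents:
    print(ent.text, ent.label_, ent.kb_id_)
```
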
## EntityLinker.update {#update tag="method"}

Learn from a batch of [`Example`](/api/example) objects, updating both the
pipe's entity linking model and context encoder. Delegates to
[`predict`](/api/entitylinker#predict) and
[`get_loss`](/api/entitylinker#get_loss).

@@ -142,40 +144,20 @@ pipe's entity linking model and context encoder. Delegates to

> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab, nel_model)
> optimizer = nlp.begin_training()
> losses = entity_linker.update(examples, sgd=optimizer)
> ```

| Name              | Type                | Description                                                                                                                                 |
| ----------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                     |
| _keyword-only_    |                     |                                                                                                                                                 |
| `drop`            | float               | The dropout rate.                                                                                                                               |
| `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations).      |
| `sgd`             | `Optimizer`         | [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                                     |
| `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                   |
| **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                                |
## EntityLinker.set_kb {#set_kb tag="method"}

@@ -195,9 +177,9 @@ identifiers.

## EntityLinker.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this
method, a knowledge base should have been defined with
[`set_kb`](/api/entitylinker#set_kb).

> #### Example

@@ -209,12 +191,12 @@ knowledge base should have been defined with

> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
> ```

| Name           | Type                    | Description                                                                                                                                                      |
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                             |
| `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                             |
| `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. |
| **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                     |
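
Before `begin_training`, the component needs a knowledge base via
[`set_kb`](/api/entitylinker#set_kb). The sketch below builds a tiny in-memory
`KnowledgeBase`; the entity ID, frequency, alias and vector length are made-up
illustration values, not taken from this diff:

```python
# Sketch: defining a minimal knowledge base before training the linker.
# Entity IDs, frequencies, aliases and vector sizes are illustrative only.
from spacy.kb import KnowledgeBase

kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
kb.add_entity(entity="Q42", freq=50, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])

entity_linker.set_kb(kb)
optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
```
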
## EntityLinker.create_optimizer {#create_optimizer tag="method"}

@@ -227,9 +209,9 @@ Create an optimizer for the pipeline component.

> optimizer = entity_linker.create_optimizer()
> ```

| Name        | Type        | Description                                                      |
| ----------- | ----------- | ----------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.    |

## EntityLinker.use_params {#use_params tag="method, contextmanager"}
@@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"ner"`.

## Default config {#config}

This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.

```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defaults.cfg
```
## EntityRecognizer.\_\_init\_\_ {#init tag="method"}

> #### Example
>
> ```python
> # Construction via create_pipe
> ner = nlp.create_pipe("ner")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_ner"}}
> ner = nlp.create_pipe("ner", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import EntityRecognizer
> model = util.load_config("model.cfg", create_objects=True)["model"]
> ner = EntityRecognizer(nlp.vocab, model)
> ```

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).

| Name        | Type               | Description                                                                      |
| ----------- | ------------------ | --------------------------------------------------------------------------------- |
| `vocab`     | `Vocab`            | The shared vocabulary.                                                             |
| `model`     | `Model`            | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.   |
| `**cfg`     | -                  | Configuration parameters.                                                          |
| **RETURNS** | `EntityRecognizer` | The newly constructed object.                                                      |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}

@@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and

> pass
> ```

| Name         | Type            | Description                                             |
| ------------ | --------------- | -------------------------------------------------------- |
| `stream`     | `Iterable[Doc]` | A stream of documents.                                    |
| `batch_size` | int             | The number of texts to buffer. Defaults to `128`.         |
| **YIELDS**   | `Doc`           | Processed documents in the order of the original text.    |
## EntityRecognizer.predict {#predict tag="method"}

@@ -99,13 +104,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.

>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([doc1, doc2])
> ```

| Name        | Type               | Description                                                                                                 |
| ----------- | ------------------ | ----------------------------------------------------------------------------------------------------------- |
| `docs`      | `Iterable[Doc]`    | The documents to predict.                                                                                     |
| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal).    |

## EntityRecognizer.set_annotations {#set_annotations tag="method"}

@@ -115,38 +120,38 @@ Modify a batch of documents, using pre-computed scores.

>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([doc1, doc2])
> ner.set_annotations([doc1, doc2], scores)
> ```

| Name     | Type               | Description                                                 |
| -------- | ------------------ | ------------------------------------------------------------ |
| `docs`   | `Iterable[Doc]`    | The documents to modify.                                      |
| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`.    |
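
As with the other components, `predict` plus `set_annotations` is the two-step
form of calling the recognizer. A minimal sketch, assuming a trained model and
two pre-tokenized `Doc` objects, after which the predicted spans appear under
`doc.ents`:

```python
# Sketch: the two-step equivalent of calling the entity recognizer.
docs = [doc1, doc2]                # assumed pre-tokenized Doc objects
scores = ner.predict(docs)         # list of internal StateClass objects
ner.set_annotations(docs, scores)  # writes the predicted entity spans
print([(ent.text, ent.label_) for ent in docs[0].ents])
```
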
## EntityRecognizer.update {#update tag="method"}

Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/entityrecognizer#predict) and
[`get_loss`](/api/entityrecognizer#get_loss).

> #### Example
>
> ```python
> ner = EntityRecognizer(nlp.vocab, ner_model)
> optimizer = nlp.begin_training()
> losses = ner.update(examples, sgd=optimizer)
> ```

| Name              | Type                | Description                                                                                                                                      |
| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`        | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from.                                                                                      |
| _keyword-only_    |                     |                                                                                                                                                  |
| `drop`            | float               | The dropout rate.                                                                                                                                |
| `set_annotations` | bool                | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations).  |
| `sgd`             | `Optimizer`         | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                                                                                  |
| `losses`          | `Dict[str, float]`  | Optional record of the loss during training. The value keyed by the model's name is updated.                                                    |
| **RETURNS**       | `Dict[str, float]`  | The updated `losses` dictionary.                                                                                                                 |
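
The `set_annotations` flag is mainly useful when a later component in the same
pipeline is trained on the recognizer's predictions. A short sketch, assuming
`train_examples` is a pre-built list of `Example` objects with gold entities:

```python
# Sketch: updating the recognizer while also writing its predictions back
# to the examples, e.g. for components trained further down the pipeline.
# `train_examples` is an assumed, pre-built list of Example objects.
optimizer = nlp.begin_training()
losses = {}
losses = ner.update(train_examples, drop=0.1, sgd=optimizer,
                    losses=losses, set_annotations=True)
print(losses)
```
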
## EntityRecognizer.get_loss {#get_loss tag="method"}

@@ -157,21 +162,20 @@ predicted scores.

>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([eg.predicted for eg in examples])
> loss, d_loss = ner.get_loss(examples, scores)
> ```

| Name        | Type                | Description                                           |
| ----------- | ------------------- | ------------------------------------------------------ |
| `examples`  | `Iterable[Example]` | The batch of examples.                                  |
| `scores`    | `List[StateClass]`  | Scores representing the model's predictions.            |
| **RETURNS** | tuple               | The loss and the gradient, i.e. `(loss, gradient)`.     |
## EntityRecognizer.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example
>

@@ -181,12 +185,12 @@ has been initialized yet, the model is added.

> optimizer = ner.begin_training(pipeline=nlp.pipeline)
> ```

| Name           | Type                    | Description                                                                                                                                                            |
| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]`     | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                                                                                     |
| `pipeline`     | `List[(str, callable)]` | Optional list of pipeline components that this component is part of.                                                                                                     |
| `sgd`          | `Optimizer`             | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set.     |
| **RETURNS**    | `Optimizer`             | An optimizer.                                                                                                                                                             |

## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}

@@ -199,9 +203,9 @@ Create an optimizer for the pipeline component.

> optimizer = ner.create_optimizer()
> ```

| Name        | Type        | Description                                                      |
| ----------- | ----------- | ----------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.    |

## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}