Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-09-13 22:31:22 +02:00
commit ceb850f099
40 changed files with 643 additions and 313 deletions

View File

@ -301,6 +301,7 @@ def ensure_pathy(path):
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
git_version = get_git_version()
if dest.exists():
msg.fail("Destination of checkout must not exist", exits=1)
if not dest.parent.exists():
@ -321,24 +322,28 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
# *that* we can do by path.
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
git_version = get_git_version()
supports_sparse = git_version >= (2, 22)
# This is the "clone, but don't download anything" part.
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
if supports_sparse:
cmd += f"--filter=blob:none" # <-- The key bit
else:
msg.warn(
err_old = (
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
f"that doesn't fully support sparse checkout yet. This means that "
f"more files than necessary may be downloaded temporarily. To "
f"only download the files needed, upgrade to Git v2.22 or above."
f"that doesn't fully support sparse checkout yet."
)
_attempt_run_command(cmd)
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
msg.warn(
f"{err_unk if git_version == (0, 0) else err_old} "
f"This means that more files than necessary may be downloaded "
f"temporarily. To only download the files needed, make sure "
f"you're using Git v2.22 or above."
)
try_run_command(cmd)
# Now we need to find the missing filenames for the subpath we want.
# Looking for this 'rev-list' command in the git --help? Hah.
cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
ret = _attempt_run_command(cmd)
ret = try_run_command(cmd)
git_repo = _from_http_to_git(repo)
# Now pass those missings into another bit of git internals
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
@ -351,27 +356,44 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
msg.fail(err, exits=1)
if supports_sparse:
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
_attempt_run_command(cmd)
try_run_command(cmd)
# And finally, we can checkout our subpath
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
_attempt_run_command(cmd)
try_run_command(cmd)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
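# Illustrative summary only (derived from the commands built above): with a
# recent Git, git_sparse_checkout boils down to this sequence of calls, using
# the same repo/tmp_dir/branch/subpath values as the f-strings:
#   git clone {repo} {tmp_dir} --no-checkout --depth 1 -b {branch} --filter=blob:none
#   git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}
#   git -C {tmp_dir} fetch-pack {git_repo} {missings}
#   git -C {tmp_dir} checkout {branch} {subpath}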
def get_git_version() -> Tuple[int, int]:
ret = _attempt_run_command(["git", "--version"])
# TODO: this seems kinda brittle?
version = ret.stdout[11:].strip().split(".")
def get_git_version(
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:
"""Get the version of git and raise an error if calling 'git --version' fails.
error (str): The error message to show.
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
(0, 0) if the version couldn't be determined.
"""
ret = try_run_command(["git", "--version"], error=error)
stdout = ret.stdout.strip()
if not stdout or not stdout.startswith("git version"):
return (0, 0)
version = stdout[11:].strip().split(".")
return (int(version[0]), int(version[1]))
def _attempt_run_command(cmd: Union[str, List[str]]):
def try_run_command(
cmd: Union[str, List[str]], error: str = "Could not run command"
) -> subprocess.CompletedProcess:
"""Try running a command and raise an error if it fails.
cmd (Union[str, List[str]]): The command to run.
error (str): The error message.
RETURNS (CompletedProcess): The completed process if the command ran.
"""
try:
return run_command(cmd, capture=True)
except subprocess.CalledProcessError as e:
err = f"Could not run command"
msg.fail(err)
msg.fail(error)
print(cmd)
sys.exit(1)
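# Usage sketch for the two helpers above (illustrative only): get_git_version()
# falls back to (0, 0) when the version can't be parsed, and try_run_command()
# prints the error and exits instead of raising, so callers don't need a try/except.
#   major, minor = get_git_version()
#   if (major, minor) >= (2, 22):
#       print(try_run_command("git --version").stdout.strip())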
@ -387,8 +409,15 @@ def _from_http_to_git(repo: str) -> str:
return repo
def string_to_list(value, intify=False):
"""Parse a comma-separated string to a list"""
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
"""Parse a comma-separated string to a list and account for various
formatting options. Mostly used to handle CLI arguments that take a list of
comma-separated values.
value (str): The value to parse.
intify (bool): Whether to convert values to ints.
RETURNS (Union[List[str], List[int]]): A list of strings or ints.
"""
if not value:
return []
if value.startswith("[") and value.endswith("]"):

View File

@ -5,7 +5,8 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation
import typer
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
from ._util import Arg, Opt, debug_cli, show_validation_error
from ._util import parse_config_overrides, string_to_list
from .. import util

View File

@ -277,7 +277,7 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
def ensure_shape(lines):
"""Ensure that the first line of the data is the vectors shape.
If it's not, we read in the data and output the shape as the first result,
so that the reader doesn't have to deal with the problem.
"""

View File

@ -1,10 +1,10 @@
from typing import Optional, Dict, Any
import random
from typing import Optional
import numpy
import time
import re
from collections import Counter
from pathlib import Path
from thinc.api import Config
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
@ -15,11 +15,10 @@ import typer
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..errors import Errors
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..tokens import Doc
from ..attrs import ID, HEAD
from ..attrs import ID
from .. import util
@ -30,9 +29,8 @@ from .. import util
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
@ -60,13 +58,35 @@ def pretrain_cli(
DOCS: https://nightly.spacy.io/api/cli#pretrain
"""
overrides = parse_config_overrides(ctx.args)
config_overrides = parse_config_overrides(ctx.args)
import_code(code_path)
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
config = util.load_config(
config_path,
overrides=config_overrides,
interpolate=True
)
if not config.get("pretraining"):
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
config.to_disk(output_dir / "config.cfg")
msg.good("Saved config file in the output directory")
pretrain(
texts_loc,
config,
output_dir,
config_path,
config_overrides=overrides,
resume_path=resume_path,
epoch_resume=epoch_resume,
use_gpu=use_gpu,
@ -74,52 +94,22 @@ def pretrain_cli(
def pretrain(
texts_loc: Path,
config: Config,
output_dir: Path,
config_path: Path,
config_overrides: Dict[str, Any] = {},
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
use_gpu: int = -1,
use_gpu: int=-1
):
verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=config_overrides)
nlp, config = util.load_model_from_config(config)
pretrain_config = config["pretraining"]
if not pretrain_config:
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
seed = pretrain_config["seed"]
if seed is not None:
fix_random_seed(seed)
if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]:
if config["system"].get("seed") is not None:
fix_random_seed(config["system"]["seed"])
if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
use_pytorch_for_gpu_memory()
config.to_disk(output_dir / "config.cfg")
msg.good("Saved config file in the output directory")
if texts_loc != "-": # reading from a file
with msg.loading("Loading input texts..."):
texts = list(srsly.read_jsonl(texts_loc))
random.shuffle(texts)
else: # reading from stdin
msg.info("Reading input text from stdin...")
texts = srsly.read_jsonl("-")
tok2vec_path = pretrain_config["tok2vec_model"]
tok2vec = config
for subpath in tok2vec_path.split("."):
tok2vec = tok2vec.get(subpath)
model = create_pretraining_model(nlp, tok2vec, pretrain_config)
optimizer = pretrain_config["optimizer"]
nlp, config = util.load_model_from_config(config)
P_cfg = config["pretraining"]
corpus = P_cfg["corpus"]
batcher = P_cfg["batcher"]
model = create_pretraining_model(nlp, config["pretraining"])
optimizer = config["pretraining"]["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
@ -147,38 +137,35 @@ def pretrain(
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(srsly.json_dumps(log) + "\n")
skip_counter = 0
objective = create_objective(pretrain_config["objective"])
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
for batch_id, batch in enumerate(batches):
docs, count = make_docs(
nlp,
batch,
max_length=pretrain_config["max_length"],
min_length=pretrain_config["min_length"],
)
skip_counter += count
objective = create_objective(P_cfg["objective"])
# TODO: I think we probably want this to look more like the
# 'create_train_batches' function?
for epoch in range(epoch_resume, P_cfg["max_epochs"]):
for batch_id, batch in enumerate(batcher(corpus(nlp))):
docs = ensure_docs(batch)
loss = make_update(model, docs, optimizer, objective)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
break
if pretrain_config["n_save_every"] and (
batch_id % pretrain_config["n_save_every"] == 0
if P_cfg["n_save_every"] and (
batch_id % P_cfg["n_save_every"] == 0
):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0
if texts_loc != "-":
# Reshuffle the texts if texts were loaded from a file
random.shuffle(texts)
if skip_counter > 0:
msg.warn(f"Skipped {skip_counter} empty values")
msg.good("Successfully finished pretrain")
def ensure_docs(examples_or_docs):
docs = []
for eg_or_doc in examples_or_docs:
if isinstance(eg_or_doc, Doc):
docs.append(eg_or_doc)
else:
docs.append(eg_or_doc.reference)
return docs
def _resume_model(model, resume_path, epoch_resume):
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
@ -211,36 +198,6 @@ def make_update(model, docs, optimizer, objective_func):
return float(loss)
def make_docs(nlp, batch, min_length, max_length):
docs = []
skip_count = 0
for record in batch:
if not isinstance(record, dict):
raise TypeError(Errors.E137.format(type=type(record), line=record))
if "tokens" in record:
words = record["tokens"]
if not words:
skip_count += 1
continue
doc = Doc(nlp.vocab, words=words)
elif "text" in record:
text = record["text"]
if not text:
skip_count += 1
continue
doc = nlp.make_doc(text)
else:
raise ValueError(Errors.E138.format(text=record))
if "heads" in record:
heads = record["heads"]
heads = numpy.asarray(heads, dtype="uint64")
heads = heads.reshape((len(doc), 1))
doc = doc.from_array([HEAD], heads)
if min_length <= len(doc) < max_length:
docs.append(doc)
return docs, skip_count
def create_objective(config):
"""Create the objective for pretraining.
@ -296,7 +253,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
return loss, d_target
def create_pretraining_model(nlp, tok2vec, pretrain_config):
def create_pretraining_model(nlp, pretrain_config):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
@ -304,6 +261,12 @@ def create_pretraining_model(nlp, tok2vec, pretrain_config):
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
"""
component = nlp.get_pipe(pretrain_config["component"])
if pretrain_config.get("layer"):
tok2vec = component.model.get_ref(pretrain_config["layer"])
else:
tok2vec = component.model
# TODO
maxout_pieces = 3
hidden_size = 300
@ -372,7 +335,7 @@ def _smart_round(figure, width=10, max_decimal=4):
return format_str % figure
def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
@ -388,16 +351,6 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if texts_loc != "-": # reading from a file
texts_loc = Path(texts_loc)
if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
for text in srsly.read_jsonl(texts_loc):
break
else:
msg.fail("Input file is empty", texts_loc, exits=1)
if resume_path is not None:
model_name = re.search(r"model\d+\.bin", str(resume_path))
if not model_name and not epoch_resume:

View File

@ -7,7 +7,7 @@ import requests
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
from .._util import download_file, git_sparse_checkout
from .._util import download_file, git_sparse_checkout, get_git_version
@project_cli.command("assets")
@ -41,6 +41,11 @@ def project_assets(project_dir: Path) -> None:
dest = (project_dir / asset["dest"]).resolve()
checksum = asset.get("checksum")
if "git" in asset:
git_err = (
f"Cloning spaCy project templates requires Git and the 'git' command. "
f"Make sure it's installed and that the executable is available."
)
get_git_version(error=git_err)
if dest.exists():
# If there's already a file, check for checksum
if checksum and checksum == get_checksum(dest):

View File

@ -7,7 +7,7 @@ import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_sparse_checkout
from .._util import git_sparse_checkout, get_git_version
@project_cli.command("clone")
@ -70,16 +70,12 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
git_err = (
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually.",
)
get_git_version(error=git_err)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():

View File

@ -671,6 +671,9 @@ class Errors:
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
"that you are providing a list of patterns as `List[List[dict]]`.")
E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
"through token.morph_ instead or add the string to the "
"StringStore with `nlp.vocab.strings.add(string)`.")
@add_codes

View File

@ -244,7 +244,8 @@ class Language:
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline
if not self._config["training"].get("score_weights"):
self._config["training"]["score_weights"] = combine_score_weights(score_weights)
combined_score_weights = combine_score_weights(score_weights)
self._config["training"]["score_weights"] = combined_score_weights
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
return self._config
@ -1166,14 +1167,20 @@ class Language:
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="Language", obj=type(get_examples))
raise ValueError(err)
valid_examples = False
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.begin_training", types=type(example)
)
raise ValueError(err)
else:
valid_examples = True
for word in [t.text for t in example.reference]:
_ = self.vocab[word] # noqa: F841
if not valid_examples:
err = Errors.E930.format(name="Language", obj="empty list")
raise ValueError(err)
if device >= 0: # TODO: do we need this here?
require_gpu(device)
if self.vocab.vectors.data.shape[1] >= 1:
@ -1274,7 +1281,7 @@ class Language:
util.logger.debug(doc)
eg.predicted = doc
results = scorer.score(examples)
n_words = sum(len(eg.predicted) for eg in examples)
n_words = sum(len(doc) for doc in docs)
results["speed"] = n_words / (end_time - start_time)
return results

View File

@ -56,7 +56,7 @@ subword_features = true
@Language.factory(
"textcat",
assigns=["doc.cats"],
default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL},
default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
scores=[
"cats_score",
"cats_score_desc",
@ -75,6 +75,7 @@ def make_textcat(
name: str,
model: Model[List[Doc], List[Floats2d]],
labels: Iterable[str],
threshold: float,
) -> "TextCategorizer":
"""Create a TextCategorizer compoment. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels can
@ -86,8 +87,9 @@ def make_textcat(
scores for each category.
labels (list): A list of categories to learn. If empty, the model infers the
categories from the data.
threshold (float): Cutoff to consider a prediction "positive".
"""
return TextCategorizer(nlp.vocab, model, name, labels=labels)
return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold)
class TextCategorizer(Pipe):
@ -103,6 +105,7 @@ class TextCategorizer(Pipe):
name: str = "textcat",
*,
labels: Iterable[str],
threshold: float,
) -> None:
"""Initialize a text categorizer.
@ -111,6 +114,7 @@ class TextCategorizer(Pipe):
name (str): The component instance name, used to add entries to the
losses during training.
labels (Iterable[str]): The labels to use.
threshold (float): Cutoff to consider a prediction "positive".
DOCS: https://nightly.spacy.io/api/textcategorizer#init
"""
@ -118,7 +122,7 @@ class TextCategorizer(Pipe):
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {"labels": labels}
cfg = {"labels": labels, "threshold": threshold}
self.cfg = dict(cfg)
@property
@ -371,5 +375,6 @@ class TextCategorizer(Pipe):
labels=self.labels,
multi_label=self.model.attrs["multi_label"],
positive_label=positive_label,
threshold=self.cfg["threshold"],
**kwargs,
)

View File

@ -246,15 +246,14 @@ class ConfigSchemaPretrainEmpty(BaseModel):
class ConfigSchemaPretrain(BaseModel):
# fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
min_length: StrictInt = Field(..., title="Minimum length of examples")
max_length: StrictInt = Field(..., title="Maximum length of examples")
dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule")
seed: Optional[StrictInt] = Field(..., title="Random seed")
use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. components.tok2vec.model")
optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: Reader = Field(..., title="Reader for the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
# TODO: use a more detailed schema for this?
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
# fmt: on

View File

@ -9,7 +9,10 @@ from spacy.pipeline.ner import DEFAULT_NER_MODEL
def _ner_example(ner):
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
doc = Doc(
ner.vocab,
words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
)
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)

View File

@ -66,3 +66,31 @@ def test_morph_set(i_has):
def test_morph_str(i_has):
assert str(i_has[0].morph) == "PronType=prs"
assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin"
def test_morph_property(tokenizer):
doc = tokenizer("a dog")
# set through token.morph_
doc[0].morph_ = "PronType=prs"
assert doc[0].morph_ == "PronType=prs"
assert doc.to_array(["MORPH"])[0] != 0
# unset with token.morph
doc[0].morph = 0
assert doc.to_array(["MORPH"])[0] == 0
# empty morph is equivalent to "_"
doc[0].morph_ = ""
assert doc[0].morph_ == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# "_" morph is also equivalent to empty morph
doc[0].morph_ = "_"
assert doc[0].morph_ == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# set through existing hash with token.morph
tokenizer.vocab.strings.add("Feat=Val")
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
assert doc[0].morph_ == "Feat=Val"

View File

@ -78,7 +78,7 @@ def patterns(en_vocab):
"REL_OP": ">",
"RIGHT_ID": "fox",
"RIGHT_ATTRS": {"ORTH": "fox"},
}
},
]
pattern5 = [
@ -233,9 +233,7 @@ def test_dependency_matcher_callback(en_vocab, doc):
assert matches == matches2
@pytest.mark.parametrize(
"op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20),]
)
@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
# two sentences to test that all matches are within the same sentence
doc = get_doc(
@ -248,7 +246,7 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
for text in ["a", "b", "c", "d", "e"]:
pattern = [
{"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
{"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {},},
{"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}},
]
matcher = DependencyMatcher(en_vocab)
matcher.add("A", [pattern])

View File

@ -54,7 +54,10 @@ def _parser_example(parser):
def _ner_example(ner):
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
doc = Doc(
ner.vocab,
words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
)
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)

View File

@ -30,9 +30,10 @@ TRAIN_DATA = [
),
]
def test_begin_training_examples():
nlp = Language()
senter = nlp.add_pipe("senter")
nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

View File

@ -89,7 +89,7 @@ def test_no_label():
def test_implicit_label():
nlp = Language()
textcat = nlp.add_pipe("textcat")
nlp.add_pipe("textcat")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

View File

@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab):
# See issue #1105
cfg = {"model": DEFAULT_TEXTCAT_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"]
textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"])
textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5)
textcat.to_bytes(exclude=["vocab"])

View File

@ -5,7 +5,6 @@ from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
@ -231,48 +230,6 @@ def test_cli_converters_conll_ner2json():
assert ent.text in ["New York City", "London"]
def test_pretrain_make_docs():
nlp = English()
valid_jsonl_text = {"text": "Some text"}
docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)
assert len(docs) == 1
assert skip_count == 0
valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}
docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)
assert len(docs) == 1
assert skip_count == 0
invalid_jsonl_type = 0
with pytest.raises(TypeError):
make_docs(nlp, [invalid_jsonl_type], 1, 100)
invalid_jsonl_key = {"invalid": "Does not matter"}
with pytest.raises(ValueError):
make_docs(nlp, [invalid_jsonl_key], 1, 100)
empty_jsonl_text = {"text": ""}
docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)
assert len(docs) == 0
assert skip_count == 1
empty_jsonl_tokens = {"tokens": []}
docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)
assert len(docs) == 0
assert skip_count == 1
too_short_jsonl = {"text": "This text is not long enough"}
docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)
assert len(docs) == 0
assert skip_count == 0
too_long_jsonl = {"text": "This text contains way too much tokens for this test"}
docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)
assert len(docs) == 0
assert skip_count == 0
def test_project_config_validation_full():
config = {
"vars": {"some_var": 20},

View File

@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
doc = tokenizer(text)
assert [token.text for token in doc] == ["_SPECIAL_", "."]
def test_tokenizer_special_cases_idx(tokenizer):
text = "the _ID'X_"
tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
doc = tokenizer(text)
assert doc[1].idx == 4
assert doc[2].idx == 7

View File

@ -343,8 +343,9 @@ cdef class Tokenizer:
for j in range(cached.length):
tokens[i + offset + j] = cached.data.tokens[j]
tokens[i + offset + j].idx = orig_idx + idx_offset
idx_offset += cached.data.tokens[j].lex.length + \
1 if cached.data.tokens[j].spacy else 0
idx_offset += cached.data.tokens[j].lex.length
if cached.data.tokens[j].spacy:
idx_offset += 1
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
i += span_end - span_start
offset += span[3]

View File

@ -214,9 +214,17 @@ cdef class Token:
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
@property
def morph(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
property morph:
def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
def __set__(self, attr_t morph):
if morph == 0:
self.c.morph = morph
elif morph in self.vocab.strings:
self.morph_ = self.vocab.strings[morph]
else:
raise ValueError(Errors.E1009.format(val=morph))
property morph_:
def __get__(self):

View File

@ -1,6 +1,7 @@
import warnings
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from pathlib import Path
import srsly
from .. import util
from .example import Example
@ -21,6 +22,36 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
@util.registry.readers("spacy.JsonlReader.v1")
def create_jsonl_reader(
path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
) -> Callable[["Language"], Iterable[Example]]:
return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
path = util.ensure_path(path)
if not path.is_dir() and path.parts[-1].endswith(file_type):
return [path]
orig_path = path
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts and path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith(file_type):
locs.append(path)
if len(locs) == 0:
warnings.warn(Warnings.W090.format(path=orig_path))
return locs
class Corpus:
"""Iterate Example objects from a file or directory of DocBin (.spacy)
@ -47,36 +78,13 @@ class Corpus:
*,
limit: int = 0,
gold_preproc: bool = False,
max_length: bool = False,
max_length: int = 0,
) -> None:
self.path = util.ensure_path(path)
self.gold_preproc = gold_preproc
self.max_length = max_length
self.limit = limit
@staticmethod
def walk_corpus(path: Union[str, Path]) -> List[Path]:
path = util.ensure_path(path)
if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE):
return [path]
orig_path = path
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts and path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith(FILE_TYPE):
locs.append(path)
if len(locs) == 0:
warnings.warn(Warnings.W090.format(path=orig_path))
return locs
def __call__(self, nlp: "Language") -> Iterator[Example]:
"""Yield examples from the data.
@ -85,11 +93,11 @@ class Corpus:
DOCS: https://nightly.spacy.io/api/corpus#call
"""
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE))
if self.gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
else:
examples = self.make_examples(nlp, ref_docs, self.max_length)
examples = self.make_examples(nlp, ref_docs)
yield from examples
def _make_example(
@ -108,18 +116,18 @@ class Corpus:
return Example(nlp.make_doc(reference.text), reference)
def make_examples(
self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
self, nlp: "Language", reference_docs: Iterable[Doc]
) -> Iterator[Example]:
for reference in reference_docs:
if len(reference) == 0:
continue
elif max_length == 0 or len(reference) < max_length:
elif self.max_length == 0 or len(reference) < self.max_length:
yield self._make_example(nlp, reference, False)
elif reference.is_sentenced:
for ref_sent in reference.sents:
if len(ref_sent) == 0:
continue
elif max_length == 0 or len(ref_sent) < max_length:
elif self.max_length == 0 or len(ref_sent) < self.max_length:
yield self._make_example(nlp, ref_sent.as_doc(), False)
def make_examples_gold_preproc(
@ -151,3 +159,57 @@ class Corpus:
i += 1
if self.limit >= 1 and i >= self.limit:
break
class JsonlTexts:
"""Iterate Doc objects from a file or directory of jsonl
formatted raw text files.
path (Path): The directory or filename to read from.
min_length (int): Minimum document length (in tokens). Shorter documents
will be skipped. Defaults to 0, which indicates no limit.
max_length (int): Maximum document length (in tokens). Longer documents will
be skipped. Defaults to 0, which indicates no limit.
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
Defaults to 0, which indicates no limit.
DOCS: https://nightly.spacy.io/api/corpus
"""
file_type = "jsonl"
def __init__(
self,
path: Union[str, Path],
*,
limit: int = 0,
min_length: int = 0,
max_length: int = 0,
) -> None:
self.path = util.ensure_path(path)
self.min_length = min_length
self.max_length = max_length
self.limit = limit
def __call__(self, nlp: "Language") -> Iterator[Example]:
"""Yield examples from the data.
nlp (Language): The current nlp object.
YIELDS (Example): The examples.
DOCS: https://nightly.spacy.io/api/corpus#call
"""
for loc in walk_corpus(self.path, "jsonl"):
records = srsly.read_jsonl(loc)
for record in records:
doc = nlp.make_doc(record["text"])
if self.min_length >= 1 and len(doc) < self.min_length:
continue
elif self.max_length >= 1 and len(doc) >= self.max_length:
continue
else:
words = [w.text for w in doc]
spaces = [bool(w.whitespace_) for w in doc]
# We don't *need* an example here, but it seems nice to
# make it match the Corpus signature.
yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
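# Usage sketch (illustrative; the path is made up): the registered
# "spacy.JsonlReader.v1" function above returns a JsonlTexts instance, i.e. a
# callable that yields Example objects for a given nlp object.
#   reader = create_jsonl_reader("corpus/raw_text.jsonl", min_length=5, max_length=500)
#   for example in reader(nlp):
#       ...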

View File

@ -36,20 +36,12 @@ def console_logger():
keys=list(info["losses"].keys()),
)
) from None
try:
scores = [
"{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100)
for col in score_cols
]
except KeyError as e:
raise KeyError(
Errors.E983.format(
dict="scores (other)",
key=str(e),
keys=list(info["other_scores"].keys()),
)
) from None
scores = []
for col in score_cols:
score = float(info["other_scores"].get(col, 0.0))
if col != "speed":
score *= 100
scores.append("{0:.2f}".format(score))
data = (
[info["epoch"], info["step"]]
+ losses

View File

@ -648,12 +648,20 @@ def join_command(command: List[str]) -> str:
return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
def run_command(
command: Union[str, List[str]],
*,
capture: bool = False,
stdin: Optional[Any] = None,
) -> Optional[subprocess.CompletedProcess]:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
stdin (Optional[Any]): stdin to read from or None.
capture (bool): Whether to capture the output.
RETURNS (Optional[CompletedProcess]): The process object.
"""
if isinstance(command, str):
command = split_command(command)
@ -671,6 +679,10 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
raise FileNotFoundError(
Errors.E970.format(str_command=" ".join(command), tool=command[0])
) from None
except subprocess.CalledProcessError as e:
# We don't want a duplicate traceback here
print(e)
sys.exit(1)
if ret.returncode != 0:
sys.exit(ret.returncode)
return ret
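# Illustrative usage (a sketch): with capture=True the CompletedProcess is
# returned and its stdout can be inspected, as the Git helpers in cli/_util.py do.
#   ret = run_command(["git", "--version"], capture=True)
#   print(ret.stdout.strip())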

View File

@ -14,6 +14,7 @@ menu:
- ['evaluate', 'evaluate']
- ['package', 'package']
- ['project', 'project']
- ['ray', 'ray']
---
spaCy's CLI provides a range of helpful commands for downloading and training
@ -1134,3 +1135,47 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
## ray {#ray new="3"}
The `spacy ray` CLI includes commands for parallel and distributed computing via
[Ray](https://ray.io).
<Infobox variant="warning">
To use this command, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
### ray train {#ray-train tag="command"}
Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
command works just like [`spacy train`](/api/cli#train). For more details and
examples, see the usage guide on
[parallel training](/usage/training#parallel-training) and the spaCy project
[integration](/usage/projects#ray).
```cli
$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
```
> #### Example
>
> ```cli
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
| Name | Description |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |

View File

@ -30,15 +30,17 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
> config = {
> "labels": [],
> "threshold": 0.5,
> "model": DEFAULT_TEXTCAT_MODEL,
> }
> nlp.add_pipe("textcat", config=config)
> ```
| Setting | Description |
| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
| Setting | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/textcat.py
@ -58,7 +60,7 @@ architectures and their arguments and hyperparameters.
>
> # Construction from class
> from spacy.pipeline import TextCategorizer
> textcat = TextCategorizer(nlp.vocab, model)
> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5)
> ```
Create a new pipeline instance. In your application, you would normally use a
@ -72,6 +74,7 @@ shortcut for this and instantiate the component using its string name and
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `labels` | The labels to use. ~~Iterable[str]~~ |
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
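For illustration, a minimal sketch of adding the component with a custom
`threshold` through `nlp.add_pipe` (the blank English pipeline is only an
assumption for the example; any other settings keep their defaults):

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat", config={"threshold": 0.6})
```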
## TextCategorizer.\_\_call\_\_ {#call tag="method"}

File diff suppressed because one or more lines are too long

(New image file added, 67 KiB.)

View File

@ -26,7 +26,7 @@ on training Stanza on this corpus to allow direct comparison.
<figure>
| System | POS | USA | LAS |
| System | POS | UAS | LAS |
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
| spaCy RoBERTa (2020) | | | |
| spaCy CNN (2020) | | | |

View File

@ -61,17 +61,13 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
<Benchmarks />
<!-- TODO:
<Project id="benchmarks/penn_treebank">
<Project id="benchmarks/parsing_penn_treebank">
The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
our project template.
</Project>
-->
<!-- ## Citing spaCy {#citation}
<!-- TODO: update -->

View File

@ -796,11 +796,9 @@ workflows, including
evaluation workflow that lets you compare two different models and their
results.
<Project id="integrations/prodigy">
<!-- TODO: <Project id="integrations/prodigy">
<!-- TODO: -->
</Project>
</Project> -->
---
@ -817,7 +815,7 @@ full embedded visualizer, as well as individual components.
> #### Installation
>
> ```bash
> $ pip install "spacy_streamlit>=1.0.0a0"
> $ pip install "spacy-streamlit>=1.0.0a0"
> ```
![](../images/spacy-streamlit.png)
@ -915,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
</Infobox>
<!-- TODO: document -->
> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray for parallel and distributed
training with spaCy via our lightweight
[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
package is installed in the same environment as spaCy, it will automatically add
[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
You can integrate [`spacy ray train`](/api/cli#ray-train) into your
`project.yml` just like the regular training command:
<!-- prettier-ignore -->
```yaml
### project.yml
- name: "ray"
help: "Train a model via parallel training with Ray"
script:
- "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
deps:
- "corpus/train.spacy"
- "corpus/dev.spacy"
```
<!-- TODO: <Project id="integrations/ray">
</Project> -->
---
@ -943,12 +973,14 @@ your results.
![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values')
<!-- TODO:
<Project id="integrations/wandb">
Get started with tracking your spaCy training runs in Weights & Biases using our
project template. It includes a simple config using the `WandbLogger`, as well
as a custom logger implementation you can adjust for your specific use case.
<!-- TODO: -->
</Project>
-->

View File

@ -1075,7 +1075,7 @@ relations and tokens we want to match:
> #### Visualizing the parse
>
> The [`displacy` visualizer](/usage/visualizer) lets you render `Doc` objects
> The [`displacy` visualizer](/usage/visualizers) lets you render `Doc` objects
> and their dependency parse and part-of-speech tags:
>
> ```python

View File

@ -7,7 +7,7 @@ menu:
- ['Quickstart', 'quickstart']
- ['Config System', 'config']
- ['Custom Functions', 'custom-functions']
# - ['Parallel Training', 'parallel-training']
- ['Parallel Training', 'parallel-training']
- ['Internal API', 'api']
---
@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
## Parallel & distributed training with Ray {#parallel-training}
> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. Parallel
training won't always be faster, though: it depends on your batch size, models,
and hardware.
<Infobox variant="warning">
To use Ray with spaCy, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
setup. You can optionally set the `--address` option to point to your Ray
cluster. If it's not set, Ray will run locally.
```cli
python -m spacy ray train config.cfg --n-workers 2
```
<!-- TODO: <Project id="integrations/ray">
</Project> -->
### How parallel training works {#parallel-training-details}
Each worker receives a shard of the **data** and builds a copy of the **model
and optimizer** from the [`config.cfg`](#config). It also has a communication
channel to **pass gradients and parameters** to the other workers. Additionally,
each worker is given ownership of a subset of the parameter arrays. Every
parameter array is owned by exactly one worker, and the workers are given a
mapping so they know which worker owns which parameter.
![Illustration of setup](../images/spacy-ray.svg)
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the
gradient. Workers use the gradients they receive and the ones they compute
locally to update the parameters they own, and then broadcast the updated array
and a new version ID to the other workers.
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates; they do not have to pull
them and block on the result, so the transfers can happen in the background,
overlapped with the actual training work. The workers also do not have to stop
and wait for each other ("synchronize") at the start of each batch. This is very
useful for spaCy, because spaCy is often trained on long documents, which means
**batches can vary in size** significantly. Uneven workloads make synchronous
gradient descent inefficient, because if one batch is slow, all of the other
workers are stuck waiting for it to complete before they can continue.
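The snippet below is a schematic sketch of this ownership and versioning logic,
not the actual `spacy-ray` implementation; all names (`owner`, `versions`,
`receive_gradient`) are made up for illustration:

```python
import numpy

# One entry per parameter array; each array is owned by exactly one worker.
params = {"embed": numpy.zeros(4), "output": numpy.zeros(4)}
owner = {"embed": 0, "output": 1}
versions = {name: 0 for name in params}  # bumped by the owner on every update

def receive_gradient(worker_id, name, grad, grad_version, lr=0.001):
    """What a worker does with a gradient it computed or received."""
    if owner[name] != worker_id:
        return "not the owner: send the gradient (plus version) to the owner"
    if grad_version != versions[name]:
        return "stale gradient: discard it"
    params[name] -= lr * grad            # apply the update locally
    versions[name] += 1
    return "applied: broadcast the new array and version ID to the other workers"

print(receive_gradient(0, "embed", numpy.ones(4), grad_version=0))
print(receive_gradient(0, "output", numpy.ones(4), grad_version=0))
```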
## Internal training API {#api}
<Infobox variant="warning">

View File

@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
- [Training & config system](#features-training)
- [Custom models](#features-custom-models)
- [End-to-end project workflows](#features-projects)
- [Parallel training with Ray](#features-parallel-training)
- [New built-in components](#features-pipeline-components)
- [New custom component API](#features-components)
- [Dependency matching](#features-dep-matcher)
@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.
</Infobox>
### Parallel and distributed training with Ray {#features-parallel-training}
> #### Example
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> # Train a pipeline
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. The Ray
integration is powered by a lightweight extension package,
[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
parallel training.
![Illustration of setup](../images/spacy-ray.svg)
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage: **
[Parallel and distributed training](/usage/training#parallel-training),
[spaCy Projects integration](/usage/projects#ray)
- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)
</Infobox>
### New built-in pipeline components {#features-pipeline-components}
spaCy v3.0 includes several new trainable and rule-based components that you can
@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
### New and updated documentation {#new-docs}

View File

@ -26,11 +26,27 @@ const replacements = {
GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
}
/**
* Compute the overall total counts of models and languages
*/
function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
starters: langs
.map(({ starters }) => (starters ? starters.length : 0))
.reduce((a, b) => a + b, 0),
}
}
module.exports = {
siteMetadata: {
...site,
sidebars,
...models,
counts: getCounts(models.languages),
universe,
nightly: isNightly,
binderBranch,

View File

@ -1,5 +1,16 @@
{
"resources": [
{
"id": "spacy-ray",
"title": "spacy-ray",
"slogan": "Parallel and distributed training with spaCy and Ray",
"description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
"github": "explosion/spacy-ray",
"pip": "spacy-ray",
"category": ["training"],
"author": "Explosion / Anyscale",
"thumb": "https://i.imgur.com/7so6ZpS.png"
},
{
"id": "spacy-sentence-bert",
"title": "spaCy - sentence-transformers",
@ -2518,14 +2529,14 @@
"description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
"pip": "cov-bsv",
"code_example": [
"import cov_bsv",
"",
"nlp = cov_bsv.load()",
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
"",
"print(doc.ents)",
"print(doc._.cov_classification)",
"cov_bsv.visualize_doc(doc)"
"import cov_bsv",
"",
"nlp = cov_bsv.load()",
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
"",
"print(doc.ents)",
"print(doc._.cov_classification)",
"cov_bsv.visualize_doc(doc)"
],
"category": ["pipeline", "standalone", "biomedical", "scientific"],
"tags": ["clinical", "epidemiology", "covid-19", "surveillance"],

View File

@ -14,6 +14,7 @@ import GitHubCode from './github'
import classes from '../styles/code.module.sass'
const WRAP_THRESHOLD = 30
const CLI_GROUPS = ['init', 'debug', 'project', 'ray']
export default props => (
<Pre>
@ -99,7 +100,6 @@ function replacePrompt(line, prompt, isFirst = false) {
}
function parseArgs(raw) {
const commandGroups = ['init', 'debug', 'project']
let args = raw.split(' ').filter(arg => arg)
const result = {}
while (args.length) {
@ -108,7 +108,12 @@ function parseArgs(raw) {
const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
result[opt] = isFlag ? true : args.shift()
} else {
const key = commandGroups.includes(opt) ? `${opt} ${args.shift()}` : opt
let key = opt
if (CLI_GROUPS.includes(opt)) {
if (args.length && !args[0].startsWith('-')) {
key = `${opt} ${args.shift()}`
}
}
result[key] = null
}
}

View File

@ -38,8 +38,8 @@ export const LandingSubtitle = ({ children }) => (
)
export const LandingGrid = ({ cols = 3, blocks = false, style, children }) => (
<Content className={classNames(classes.grid, { [classes.blocks]: blocks })}>
<Grid cols={cols} narrow={blocks} style={style}>
<Content className={classNames({ [classes.blocks]: blocks })}>
<Grid cols={cols} narrow={blocks} className={classes.grid} style={style}>
{children}
</Grid>
</Content>

View File

@ -26,8 +26,11 @@
border-bottom-right-radius: 0
.icon
width: 2rem
height: 2rem
$width: 2rem
width: $width
height: $width
flex: 0 0 $width
background: var(--color-theme)
color: var(--color-back)
border-radius: 50%

View File

@ -128,14 +128,17 @@
padding-right: 2rem
@include breakpoint(max, md)
.banner
padding: 1rem 3rem
.banner-content
display: block
.banner-text
padding-top: 0
.col
grid-column: 1 / span 2
.grid
grid-template-columns: 1fr !important
.banner-button
margin-bottom: var(--spacing-sm)

View File

@ -54,23 +54,8 @@ for entity in doc.ents:
print(entity.text, entity.label_)
`
/**
* Compute the overall total counts of models and languages
*/
function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
starters: langs
.map(({ starters }) => (starters ? starters.length : 0))
.reduce((a, b) => a + b, 0),
}
}
const Landing = ({ data }) => {
const counts = getCounts(data.languages)
const { counts } = data
return (
<>
<LandingHeader nightly={data.nightly}>
@ -345,7 +330,10 @@ const landingQuery = graphql`
siteMetadata {
nightly
repo
languages {
counts {
langs
modelLangs
starterLangs
models
starters
}