diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 360d2439a..649c2b373 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -301,6 +301,7 @@ def ensure_pathy(path):
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
+ git_version = get_git_version()
if dest.exists():
msg.fail("Destination of checkout must not exist", exits=1)
if not dest.parent.exists():
@@ -321,24 +322,28 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
# *that* we can do by path.
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
- git_version = get_git_version()
supports_sparse = git_version >= (2, 22)
# This is the "clone, but don't download anything" part.
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
if supports_sparse:
cmd += f"--filter=blob:none" # <-- The key bit
else:
- msg.warn(
+ err_old = (
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
- f"that doesn't fully support sparse checkout yet. This means that "
- f"more files than necessary may be downloaded temporarily. To "
- f"only download the files needed, upgrade to Git v2.22 or above."
+ f"that doesn't fully support sparse checkout yet."
)
- _attempt_run_command(cmd)
+ err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
+ msg.warn(
+ f"{err_unk if git_version == (0, 0) else err_old} "
+ f"This means that more files than necessary may be downloaded "
+ f"temporarily. To only download the files needed, make sure "
+ f"you're using Git v2.22 or above."
+ )
+ try_run_command(cmd)
# Now we need to find the missing filenames for the subpath we want.
# Looking for this 'rev-list' command in the git --help? Hah.
cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
- ret = _attempt_run_command(cmd)
+ ret = try_run_command(cmd)
git_repo = _from_http_to_git(repo)
# Now pass those missings into another bit of git internals
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
@@ -351,27 +356,44 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
msg.fail(err, exits=1)
if supports_sparse:
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
- _attempt_run_command(cmd)
+ try_run_command(cmd)
# And finally, we can checkout our subpath
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
- _attempt_run_command(cmd)
+ try_run_command(cmd)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
-def get_git_version() -> Tuple[int, int]:
- ret = _attempt_run_command(["git", "--version"])
- # TODO: this seems kinda brittle?
- version = ret.stdout[11:].strip().split(".")
+def get_git_version(
+ error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
+) -> Tuple[int, int]:
+ """Get the version of git and raise an error if calling 'git --version' fails.
+
+ error (str): The error message to show.
+ RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
+ (0, 0) if the version couldn't be determined.
+ """
+ ret = try_run_command(["git", "--version"], error=error)
+ stdout = ret.stdout.strip()
+ if not stdout or not stdout.startswith("git version"):
+ return (0, 0)
+ version = stdout[11:].strip().split(".")
return (int(version[0]), int(version[1]))
-def _attempt_run_command(cmd: Union[str, List[str]]):
+def try_run_command(
+ cmd: Union[str, List[str]], error: str = "Could not run command"
+) -> subprocess.CompletedProcess:
+ """Try running a command and raise an error if it fails.
+
+ cmd (Union[str, List[str]]): The command to run.
+ error (str): The error message.
+ RETURNS (CompletedProcess): The completed process if the command ran.
+ """
try:
return run_command(cmd, capture=True)
except subprocess.CalledProcessError as e:
- err = f"Could not run command"
- msg.fail(err)
+ msg.fail(error)
print(cmd)
sys.exit(1)
@@ -387,8 +409,15 @@ def _from_http_to_git(repo: str) -> str:
return repo
-def string_to_list(value, intify=False):
- """Parse a comma-separated string to a list"""
+def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
+ """Parse a comma-separated string to a list and account for various
+ formatting options. Mostly used to handle CLI arguments that take a list of
+ comma-separated values.
+
+ value (str): The value to parse.
+ intify (bool): Whether to convert values to ints.
+ RETURNS (Union[List[str], List[int]]): A list of strings or ints.
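+
+    Example (illustrative):
+        >>> string_to_list("a, b, c")
+        ['a', 'b', 'c']
+        >>> string_to_list("[1, 2, 3]", intify=True)
+        [1, 2, 3]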
+ """
if not value:
return []
if value.startswith("[") and value.endswith("]"):
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 1a250e43e..a4899a458 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -5,7 +5,8 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation
import typer
-from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
+from ._util import Arg, Opt, debug_cli, show_validation_error
+from ._util import parse_config_overrides, string_to_list
from .. import util
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 9eab7b54d..05bf99ccd 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -277,7 +277,7 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
def ensure_shape(lines):
"""Ensure that the first line of the data is the vectors shape.
-
+
If it's not, we read in the data and output the shape as the first result,
so that the reader doesn't have to deal with the problem.
"""
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 828e5f08e..70858123d 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,10 +1,10 @@
-from typing import Optional, Dict, Any
-import random
+from typing import Optional
import numpy
import time
import re
from collections import Counter
from pathlib import Path
+from thinc.api import Config
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
@@ -15,11 +15,10 @@ import typer
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
-from ..errors import Errors
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..tokens import Doc
-from ..attrs import ID, HEAD
+from ..attrs import ID
from .. import util
@@ -30,9 +29,8 @@ from .. import util
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
- texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
- output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
+ output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
@@ -60,13 +58,35 @@ def pretrain_cli(
DOCS: https://nightly.spacy.io/api/cli#pretrain
"""
- overrides = parse_config_overrides(ctx.args)
+ config_overrides = parse_config_overrides(ctx.args)
import_code(code_path)
+ verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
+ if use_gpu >= 0:
+ msg.info("Using GPU")
+ require_gpu(use_gpu)
+ else:
+ msg.info("Using CPU")
+ msg.info(f"Loading config from: {config_path}")
+
+ with show_validation_error(config_path):
+ config = util.load_config(
+ config_path,
+ overrides=config_overrides,
+            interpolate=True,
+ )
+ if not config.get("pretraining"):
+ # TODO: What's the solution here? How do we handle optional blocks?
+ msg.fail("The [pretraining] block in your config is empty", exits=1)
+ if not output_dir.exists():
+ output_dir.mkdir()
+ msg.good(f"Created output directory: {output_dir}")
+
+ config.to_disk(output_dir / "config.cfg")
+ msg.good("Saved config file in the output directory")
+
pretrain(
- texts_loc,
+ config,
output_dir,
- config_path,
- config_overrides=overrides,
resume_path=resume_path,
epoch_resume=epoch_resume,
use_gpu=use_gpu,
@@ -74,52 +94,22 @@ def pretrain_cli(
def pretrain(
- texts_loc: Path,
+ config: Config,
output_dir: Path,
- config_path: Path,
- config_overrides: Dict[str, Any] = {},
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
- use_gpu: int = -1,
+    use_gpu: int = -1,
):
- verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
- if use_gpu >= 0:
- msg.info("Using GPU")
- require_gpu(use_gpu)
- else:
- msg.info("Using CPU")
- msg.info(f"Loading config from: {config_path}")
- with show_validation_error(config_path):
- config = util.load_config(config_path, overrides=config_overrides)
- nlp, config = util.load_model_from_config(config)
- pretrain_config = config["pretraining"]
- if not pretrain_config:
- # TODO: What's the solution here? How do we handle optional blocks?
- msg.fail("The [pretraining] block in your config is empty", exits=1)
- if not output_dir.exists():
- output_dir.mkdir()
- msg.good(f"Created output directory: {output_dir}")
- seed = pretrain_config["seed"]
- if seed is not None:
- fix_random_seed(seed)
- if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]:
+ if config["system"].get("seed") is not None:
+ fix_random_seed(config["system"]["seed"])
+ if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
use_pytorch_for_gpu_memory()
- config.to_disk(output_dir / "config.cfg")
- msg.good("Saved config file in the output directory")
- if texts_loc != "-": # reading from a file
- with msg.loading("Loading input texts..."):
- texts = list(srsly.read_jsonl(texts_loc))
- random.shuffle(texts)
- else: # reading from stdin
- msg.info("Reading input text from stdin...")
- texts = srsly.read_jsonl("-")
-
- tok2vec_path = pretrain_config["tok2vec_model"]
- tok2vec = config
- for subpath in tok2vec_path.split("."):
- tok2vec = tok2vec.get(subpath)
- model = create_pretraining_model(nlp, tok2vec, pretrain_config)
- optimizer = pretrain_config["optimizer"]
+ nlp, config = util.load_model_from_config(config)
+ P_cfg = config["pretraining"]
+ corpus = P_cfg["corpus"]
+ batcher = P_cfg["batcher"]
+    model = create_pretraining_model(nlp, P_cfg)
+    optimizer = P_cfg["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
@@ -147,38 +137,35 @@ def pretrain(
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(srsly.json_dumps(log) + "\n")
- skip_counter = 0
- objective = create_objective(pretrain_config["objective"])
- for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
- batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
- for batch_id, batch in enumerate(batches):
- docs, count = make_docs(
- nlp,
- batch,
- max_length=pretrain_config["max_length"],
- min_length=pretrain_config["min_length"],
- )
- skip_counter += count
+ objective = create_objective(P_cfg["objective"])
+ # TODO: I think we probably want this to look more like the
+ # 'create_train_batches' function?
+ for epoch in range(epoch_resume, P_cfg["max_epochs"]):
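+        # 'corpus' is a registered reader: calling it with the nlp object
+        # yields Examples or raw Docs, which 'batcher' groups into batches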
+ for batch_id, batch in enumerate(batcher(corpus(nlp))):
+ docs = ensure_docs(batch)
loss = make_update(model, docs, optimizer, objective)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
- if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
- break
- if pretrain_config["n_save_every"] and (
- batch_id % pretrain_config["n_save_every"] == 0
+ if P_cfg["n_save_every"] and (
+ batch_id % P_cfg["n_save_every"] == 0
):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0
- if texts_loc != "-":
- # Reshuffle the texts if texts were loaded from a file
- random.shuffle(texts)
- if skip_counter > 0:
- msg.warn(f"Skipped {skip_counter} empty values")
msg.good("Successfully finished pretrain")
+def ensure_docs(examples_or_docs):
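+    """Coerce a mixed batch of Example or Doc objects to a list of Docs,
+    taking the reference doc of each Example."""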
+ docs = []
+ for eg_or_doc in examples_or_docs:
+ if isinstance(eg_or_doc, Doc):
+ docs.append(eg_or_doc)
+ else:
+ docs.append(eg_or_doc.reference)
+ return docs
+
+
def _resume_model(model, resume_path, epoch_resume):
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
@@ -211,36 +198,6 @@ def make_update(model, docs, optimizer, objective_func):
return float(loss)
-def make_docs(nlp, batch, min_length, max_length):
- docs = []
- skip_count = 0
- for record in batch:
- if not isinstance(record, dict):
- raise TypeError(Errors.E137.format(type=type(record), line=record))
- if "tokens" in record:
- words = record["tokens"]
- if not words:
- skip_count += 1
- continue
- doc = Doc(nlp.vocab, words=words)
- elif "text" in record:
- text = record["text"]
- if not text:
- skip_count += 1
- continue
- doc = nlp.make_doc(text)
- else:
- raise ValueError(Errors.E138.format(text=record))
- if "heads" in record:
- heads = record["heads"]
- heads = numpy.asarray(heads, dtype="uint64")
- heads = heads.reshape((len(doc), 1))
- doc = doc.from_array([HEAD], heads)
- if min_length <= len(doc) < max_length:
- docs.append(doc)
- return docs, skip_count
-
-
def create_objective(config):
"""Create the objective for pretraining.
@@ -296,7 +253,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
return loss, d_target
-def create_pretraining_model(nlp, tok2vec, pretrain_config):
+def create_pretraining_model(nlp, pretrain_config):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
@@ -304,6 +261,12 @@ def create_pretraining_model(nlp, tok2vec, pretrain_config):
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
"""
+ component = nlp.get_pipe(pretrain_config["component"])
+ if pretrain_config.get("layer"):
+ tok2vec = component.model.get_ref(pretrain_config["layer"])
+ else:
+ tok2vec = component.model
+
# TODO
maxout_pieces = 3
hidden_size = 300
@@ -372,7 +335,7 @@ def _smart_round(figure, width=10, max_decimal=4):
return format_str % figure
-def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
+def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
@@ -388,16 +351,6 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
- if texts_loc != "-": # reading from a file
- texts_loc = Path(texts_loc)
- if not texts_loc.exists():
- msg.fail("Input text file doesn't exist", texts_loc, exits=1)
-
- for text in srsly.read_jsonl(texts_loc):
- break
- else:
- msg.fail("Input file is empty", texts_loc, exits=1)
-
if resume_path is not None:
model_name = re.search(r"model\d+\.bin", str(resume_path))
if not model_name and not epoch_resume:
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 7326b2e5c..a8b607e05 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -7,7 +7,7 @@ import requests
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
-from .._util import download_file, git_sparse_checkout
+from .._util import download_file, git_sparse_checkout, get_git_version
@project_cli.command("assets")
@@ -41,6 +41,11 @@ def project_assets(project_dir: Path) -> None:
dest = (project_dir / asset["dest"]).resolve()
checksum = asset.get("checksum")
if "git" in asset:
+ git_err = (
+ f"Cloning spaCy project templates requires Git and the 'git' command. "
+ f"Make sure it's installed and that the executable is available."
+ )
+ get_git_version(error=git_err)
if dest.exists():
# If there's already a file, check for checksum
if checksum and checksum == get_checksum(dest):
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index ab617e4ba..f691e523c 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -7,7 +7,7 @@ import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_sparse_checkout
+from .._util import git_sparse_checkout, get_git_version
@project_cli.command("clone")
@@ -70,16 +70,12 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
- try:
- subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
- except Exception:
- msg.fail(
- f"Cloning spaCy project templates requires Git and the 'git' command. ",
- f"To clone a project without Git, copy the files from the '{name}' "
- f"directory in the {repo} to {dest} manually and then run:",
- f"{COMMAND} project init {dest}",
- exits=1,
- )
+    git_err = (
+        f"Cloning spaCy project templates requires Git and the 'git' command. "
+        f"To clone a project without Git, copy the files from the '{name}' "
+        f"directory in the {repo} to {dest} manually."
+    )
+ get_git_version(error=git_err)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
diff --git a/spacy/errors.py b/spacy/errors.py
index 7164598b6..8f95609a6 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -671,6 +671,9 @@ class Errors:
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
"that you are providing a list of patterns as `List[List[dict]]`.")
+ E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
+ "through token.morph_ instead or add the string to the "
+ "StringStore with `nlp.vocab.strings.add(string)`.")
@add_codes
diff --git a/spacy/language.py b/spacy/language.py
index 70dad59f3..905cdca36 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -244,7 +244,8 @@ class Language:
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline
if not self._config["training"].get("score_weights"):
- self._config["training"]["score_weights"] = combine_score_weights(score_weights)
+ combined_score_weights = combine_score_weights(score_weights)
+ self._config["training"]["score_weights"] = combined_score_weights
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
return self._config
@@ -1166,14 +1167,20 @@ class Language:
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="Language", obj=type(get_examples))
raise ValueError(err)
+ valid_examples = False
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.begin_training", types=type(example)
)
raise ValueError(err)
+ else:
+ valid_examples = True
for word in [t.text for t in example.reference]:
_ = self.vocab[word] # noqa: F841
+ if not valid_examples:
+ err = Errors.E930.format(name="Language", obj="empty list")
+ raise ValueError(err)
if device >= 0: # TODO: do we need this here?
require_gpu(device)
if self.vocab.vectors.data.shape[1] >= 1:
@@ -1274,7 +1281,7 @@ class Language:
util.logger.debug(doc)
eg.predicted = doc
results = scorer.score(examples)
- n_words = sum(len(eg.predicted) for eg in examples)
+ n_words = sum(len(doc) for doc in docs)
results["speed"] = n_words / (end_time - start_time)
return results
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 4be6f580d..22d1de08f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -56,7 +56,7 @@ subword_features = true
@Language.factory(
"textcat",
assigns=["doc.cats"],
- default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL},
+ default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
scores=[
"cats_score",
"cats_score_desc",
@@ -75,6 +75,7 @@ def make_textcat(
name: str,
model: Model[List[Doc], List[Floats2d]],
labels: Iterable[str],
+ threshold: float,
) -> "TextCategorizer":
"""Create a TextCategorizer compoment. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels can
@@ -86,8 +87,9 @@ def make_textcat(
scores for each category.
labels (list): A list of categories to learn. If empty, the model infers the
categories from the data.
+ threshold (float): Cutoff to consider a prediction "positive".
"""
- return TextCategorizer(nlp.vocab, model, name, labels=labels)
+ return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold)
class TextCategorizer(Pipe):
@@ -103,6 +105,7 @@ class TextCategorizer(Pipe):
name: str = "textcat",
*,
labels: Iterable[str],
+ threshold: float,
) -> None:
"""Initialize a text categorizer.
@@ -111,6 +114,7 @@ class TextCategorizer(Pipe):
name (str): The component instance name, used to add entries to the
losses during training.
labels (Iterable[str]): The labels to use.
+ threshold (float): Cutoff to consider a prediction "positive".
DOCS: https://nightly.spacy.io/api/textcategorizer#init
"""
@@ -118,7 +122,7 @@ class TextCategorizer(Pipe):
self.model = model
self.name = name
self._rehearsal_model = None
- cfg = {"labels": labels}
+ cfg = {"labels": labels, "threshold": threshold}
self.cfg = dict(cfg)
@property
@@ -371,5 +375,6 @@ class TextCategorizer(Pipe):
labels=self.labels,
multi_label=self.model.attrs["multi_label"],
positive_label=positive_label,
+ threshold=self.cfg["threshold"],
**kwargs,
)
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 38f47c668..0dd2b9204 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -246,15 +246,14 @@ class ConfigSchemaPretrainEmpty(BaseModel):
class ConfigSchemaPretrain(BaseModel):
# fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
- min_length: StrictInt = Field(..., title="Minimum length of examples")
- max_length: StrictInt = Field(..., title="Maximum length of examples")
dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
- batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule")
- seed: Optional[StrictInt] = Field(..., title="Random seed")
- use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
- tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. components.tok2vec.model")
optimizer: Optimizer = Field(..., title="The optimizer to use")
+ corpus: Reader = Field(..., title="Reader for the training data")
+ batcher: Batcher = Field(..., title="Batcher for the training data")
+ component: str = Field(..., title="Component to find the layer to pretrain")
+ layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
+
# TODO: use a more detailed schema for this?
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
# fmt: on
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 751bd36d4..0c2a2a40b 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -9,7 +9,10 @@ from spacy.pipeline.ner import DEFAULT_NER_MODEL
def _ner_example(ner):
- doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
+ doc = Doc(
+ ner.vocab,
+ words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
+ )
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 6bfc198fd..f378ce042 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -66,3 +66,31 @@ def test_morph_set(i_has):
def test_morph_str(i_has):
assert str(i_has[0].morph) == "PronType=prs"
assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin"
+
+
+def test_morph_property(tokenizer):
+ doc = tokenizer("a dog")
+
+ # set through token.morph_
+ doc[0].morph_ = "PronType=prs"
+ assert doc[0].morph_ == "PronType=prs"
+ assert doc.to_array(["MORPH"])[0] != 0
+
+ # unset with token.morph
+ doc[0].morph = 0
+ assert doc.to_array(["MORPH"])[0] == 0
+
+ # empty morph is equivalent to "_"
+ doc[0].morph_ = ""
+ assert doc[0].morph_ == ""
+ assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
+
+ # "_" morph is also equivalent to empty morph
+ doc[0].morph_ = "_"
+ assert doc[0].morph_ == ""
+ assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
+
+ # set through existing hash with token.morph
+ tokenizer.vocab.strings.add("Feat=Val")
+ doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
+ assert doc[0].morph_ == "Feat=Val"
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index 72005cc82..6361a10ce 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -78,7 +78,7 @@ def patterns(en_vocab):
"REL_OP": ">",
"RIGHT_ID": "fox",
"RIGHT_ATTRS": {"ORTH": "fox"},
- }
+ },
]
pattern5 = [
@@ -233,9 +233,7 @@ def test_dependency_matcher_callback(en_vocab, doc):
assert matches == matches2
-@pytest.mark.parametrize(
- "op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20),]
-)
+@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
# two sentences to test that all matches are within the same sentence
doc = get_doc(
@@ -248,7 +246,7 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
for text in ["a", "b", "c", "d", "e"]:
pattern = [
{"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
- {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {},},
+ {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}},
]
matcher = DependencyMatcher(en_vocab)
matcher.add("A", [pattern])
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 0da42daa2..3d67e6ef6 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -54,7 +54,10 @@ def _parser_example(parser):
def _ner_example(ner):
- doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
+ doc = Doc(
+ ner.vocab,
+ words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
+ )
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 1752df5d0..5827f8ff1 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -30,9 +30,10 @@ TRAIN_DATA = [
),
]
+
def test_begin_training_examples():
nlp = Language()
- senter = nlp.add_pipe("senter")
+ nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 3f9506bb1..d12a7211a 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -89,7 +89,7 @@ def test_no_label():
def test_implicit_label():
nlp = Language()
- textcat = nlp.add_pipe("textcat")
+ nlp.add_pipe("textcat")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index db62f6569..e621aebd8 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab):
# See issue #1105
cfg = {"model": DEFAULT_TEXTCAT_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"]
- textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"])
+    textcat = TextCategorizer(
+        en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5
+    )
textcat.to_bytes(exclude=["vocab"])
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 0df707dc0..586f79afc 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -5,7 +5,6 @@ from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
-from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
@@ -231,48 +230,6 @@ def test_cli_converters_conll_ner2json():
assert ent.text in ["New York City", "London"]
-def test_pretrain_make_docs():
- nlp = English()
-
- valid_jsonl_text = {"text": "Some text"}
- docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)
- assert len(docs) == 1
- assert skip_count == 0
-
- valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}
- docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)
- assert len(docs) == 1
- assert skip_count == 0
-
- invalid_jsonl_type = 0
- with pytest.raises(TypeError):
- make_docs(nlp, [invalid_jsonl_type], 1, 100)
-
- invalid_jsonl_key = {"invalid": "Does not matter"}
- with pytest.raises(ValueError):
- make_docs(nlp, [invalid_jsonl_key], 1, 100)
-
- empty_jsonl_text = {"text": ""}
- docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)
- assert len(docs) == 0
- assert skip_count == 1
-
- empty_jsonl_tokens = {"tokens": []}
- docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)
- assert len(docs) == 0
- assert skip_count == 1
-
- too_short_jsonl = {"text": "This text is not long enough"}
- docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)
- assert len(docs) == 0
- assert skip_count == 0
-
- too_long_jsonl = {"text": "This text contains way too much tokens for this test"}
- docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)
- assert len(docs) == 0
- assert skip_count == 0
-
-
def test_project_config_validation_full():
config = {
"vars": {"some_var": 20},
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index ff31ae8a9..23c2d5c47 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
doc = tokenizer(text)
assert [token.text for token in doc] == ["_SPECIAL_", "."]
+
+
+def test_tokenizer_special_cases_idx(tokenizer):
+ text = "the _ID'X_"
+ tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
+ doc = tokenizer(text)
+ assert doc[1].idx == 4
+ assert doc[2].idx == 7
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 787cca652..17714940d 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -343,8 +343,9 @@ cdef class Tokenizer:
for j in range(cached.length):
tokens[i + offset + j] = cached.data.tokens[j]
tokens[i + offset + j].idx = orig_idx + idx_offset
- idx_offset += cached.data.tokens[j].lex.length + \
- 1 if cached.data.tokens[j].spacy else 0
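+                # Advance the running character offset by the token's length,
+                # plus one if the token has a trailing space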
+ idx_offset += cached.data.tokens[j].lex.length
+ if cached.data.tokens[j].spacy:
+ idx_offset += 1
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
i += span_end - span_start
offset += span[3]
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 50f1c5da3..2474f0637 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -214,9 +214,17 @@ cdef class Token:
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
- @property
- def morph(self):
- return MorphAnalysis.from_id(self.vocab, self.c.morph)
+ property morph:
+ def __get__(self):
+ return MorphAnalysis.from_id(self.vocab, self.c.morph)
+
+ def __set__(self, attr_t morph):
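+            # The value must be 0 (to unset the morph) or a hash that is
+            # already present in the StringStore; unknown hashes raise E1009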
+ if morph == 0:
+ self.c.morph = morph
+ elif morph in self.vocab.strings:
+ self.morph_ = self.vocab.strings[morph]
+ else:
+ raise ValueError(Errors.E1009.format(val=morph))
property morph_:
def __get__(self):
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 545f01eaa..20e4507aa 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -1,6 +1,7 @@
import warnings
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from pathlib import Path
+import srsly
from .. import util
from .example import Example
@@ -21,6 +22,36 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
+@util.registry.readers("spacy.JsonlReader.v1")
+def create_jsonl_reader(
+    path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
+) -> Callable[["Language"], Iterable[Example]]:
+ return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
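+
+# A config can refer to this reader by its registered name, for example
+# (illustrative; the exact block depends on the rest of your config):
+#
+#    [pretraining.corpus]
+#    @readers = "spacy.JsonlReader.v1"
+#    path = "corpus/raw_text.jsonl"
+#    min_length = 5
+#    max_length = 500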
+
+
+def walk_corpus(path: Union[str, Path], file_type: str) -> List[Path]:
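+    """Collect all files under `path` (or `path` itself, if it points to a
+    file) whose name ends with `file_type`, recursing into subdirectories
+    and skipping hidden files and directories."""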
+ path = util.ensure_path(path)
+ if not path.is_dir() and path.parts[-1].endswith(file_type):
+ return [path]
+ orig_path = path
+ paths = [path]
+ locs = []
+ seen = set()
+ for path in paths:
+ if str(path) in seen:
+ continue
+ seen.add(str(path))
+ if path.parts and path.parts[-1].startswith("."):
+ continue
+ elif path.is_dir():
+ paths.extend(path.iterdir())
+ elif path.parts[-1].endswith(file_type):
+ locs.append(path)
+ if len(locs) == 0:
+ warnings.warn(Warnings.W090.format(path=orig_path))
+ return locs
+
+
class Corpus:
"""Iterate Example objects from a file or directory of DocBin (.spacy)
@@ -47,36 +78,13 @@ class Corpus:
*,
limit: int = 0,
gold_preproc: bool = False,
- max_length: bool = False,
+ max_length: int = 0,
) -> None:
self.path = util.ensure_path(path)
self.gold_preproc = gold_preproc
self.max_length = max_length
self.limit = limit
- @staticmethod
- def walk_corpus(path: Union[str, Path]) -> List[Path]:
- path = util.ensure_path(path)
- if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE):
- return [path]
- orig_path = path
- paths = [path]
- locs = []
- seen = set()
- for path in paths:
- if str(path) in seen:
- continue
- seen.add(str(path))
- if path.parts and path.parts[-1].startswith("."):
- continue
- elif path.is_dir():
- paths.extend(path.iterdir())
- elif path.parts[-1].endswith(FILE_TYPE):
- locs.append(path)
- if len(locs) == 0:
- warnings.warn(Warnings.W090.format(path=orig_path))
- return locs
-
def __call__(self, nlp: "Language") -> Iterator[Example]:
"""Yield examples from the data.
@@ -85,11 +93,11 @@ class Corpus:
DOCS: https://nightly.spacy.io/api/corpus#call
"""
- ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
+ ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE))
if self.gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
else:
- examples = self.make_examples(nlp, ref_docs, self.max_length)
+ examples = self.make_examples(nlp, ref_docs)
yield from examples
def _make_example(
@@ -108,18 +116,18 @@ class Corpus:
return Example(nlp.make_doc(reference.text), reference)
def make_examples(
- self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
+ self, nlp: "Language", reference_docs: Iterable[Doc]
) -> Iterator[Example]:
for reference in reference_docs:
if len(reference) == 0:
continue
- elif max_length == 0 or len(reference) < max_length:
+ elif self.max_length == 0 or len(reference) < self.max_length:
yield self._make_example(nlp, reference, False)
elif reference.is_sentenced:
for ref_sent in reference.sents:
if len(ref_sent) == 0:
continue
- elif max_length == 0 or len(ref_sent) < max_length:
+ elif self.max_length == 0 or len(ref_sent) < self.max_length:
yield self._make_example(nlp, ref_sent.as_doc(), False)
def make_examples_gold_preproc(
@@ -151,3 +159,57 @@ class Corpus:
i += 1
if self.limit >= 1 and i >= self.limit:
break
+
+
+class JsonlTexts:
+    """Iterate Doc objects from a file or directory of JSONL-formatted
+    raw text files.
+
+ path (Path): The directory or filename to read from.
+    min_length (int): Minimum document length (in tokens). Shorter documents
+        will be skipped. Defaults to 0, which indicates no limit.
+    max_length (int): Maximum document length (in tokens). Longer documents will
+        be skipped. Defaults to 0, which indicates no limit.
+ limit (int): Limit corpus to a subset of examples, e.g. for debugging.
+ Defaults to 0, which indicates no limit.
+
+ DOCS: https://nightly.spacy.io/api/corpus
+ """
+ file_type = "jsonl"
+
+ def __init__(
+ self,
+ path: Union[str, Path],
+ *,
+ limit: int = 0,
+ min_length: int = 0,
+ max_length: int = 0,
+ ) -> None:
+ self.path = util.ensure_path(path)
+ self.min_length = min_length
+ self.max_length = max_length
+ self.limit = limit
+
+ def __call__(self, nlp: "Language") -> Iterator[Example]:
+ """Yield examples from the data.
+
+ nlp (Language): The current nlp object.
+        YIELDS (Example): The examples.
+
+ DOCS: https://nightly.spacy.io/api/corpus#call
+ """
+        for loc in walk_corpus(self.path, self.file_type):
+ records = srsly.read_jsonl(loc)
+ for record in records:
+ doc = nlp.make_doc(record["text"])
+ if self.min_length >= 1 and len(doc) < self.min_length:
+ continue
+ elif self.max_length >= 1 and len(doc) >= self.max_length:
+ continue
+ else:
+ words = [w.text for w in doc]
+ spaces = [bool(w.whitespace_) for w in doc]
+ # We don't *need* an example here, but it seems nice to
+ # make it match the Corpus signature.
+ yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index e071e5827..66fe25ed6 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -36,20 +36,12 @@ def console_logger():
keys=list(info["losses"].keys()),
)
) from None
-
- try:
- scores = [
- "{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100)
- for col in score_cols
- ]
- except KeyError as e:
- raise KeyError(
- Errors.E983.format(
- dict="scores (other)",
- key=str(e),
- keys=list(info["other_scores"].keys()),
- )
- ) from None
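+        # Scores other than "speed" are fractions and get scaled to
+        # percentages for display; "speed" is absolute (words per second)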
+ scores = []
+ for col in score_cols:
+ score = float(info["other_scores"].get(col, 0.0))
+ if col != "speed":
+ score *= 100
+ scores.append("{0:.2f}".format(score))
data = (
[info["epoch"], info["step"]]
+ losses
diff --git a/spacy/util.py b/spacy/util.py
index d8df04554..d1df1f92a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -648,12 +648,20 @@ def join_command(command: List[str]) -> str:
return " ".join(shlex.quote(cmd) for cmd in command)
-def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
+def run_command(
+ command: Union[str, List[str]],
+ *,
+ capture: bool = False,
+ stdin: Optional[Any] = None,
+) -> Optional[subprocess.CompletedProcess]:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
+ stdin (Optional[Any]): stdin to read from or None.
+ capture (bool): Whether to capture the output.
+ RETURNS (Optional[CompletedProcess]): The process object.
"""
if isinstance(command, str):
command = split_command(command)
@@ -671,6 +679,10 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
raise FileNotFoundError(
Errors.E970.format(str_command=" ".join(command), tool=command[0])
) from None
+ except subprocess.CalledProcessError as e:
+ # We don't want a duplicate traceback here
+ print(e)
+ sys.exit(1)
if ret.returncode != 0:
sys.exit(ret.returncode)
return ret
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 55e552e72..c27efb2e4 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -14,6 +14,7 @@ menu:
- ['evaluate', 'evaluate']
- ['package', 'package']
- ['project', 'project']
+ - ['ray', 'ray']
---
spaCy's CLI provides a range of helpful commands for downloading and training
@@ -1134,3 +1135,47 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
+
+## ray {#ray new="3"}
+
+The `spacy ray` CLI includes commands for parallel and distributed computing via
+[Ray](https://ray.io).
+
+
+
+To use this command, you need the
+[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
+Installing the package will automatically add the `ray` command to the spaCy
+CLI.
+
+
+
+### ray train {#ray-train tag="command"}
+
+Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
+command works just like [`spacy train`](/api/cli#train). For more details and
+examples, see the usage guide on
+[parallel training](/usage/training#parallel-training) and the spaCy project
+[integration](/usage/projects#ray).
+
+```cli
+$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
+```
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy ray train config.cfg --n-workers 2
+> ```
+
+| Name | Description |
+| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
+| `--code-path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--output`, `-o`    | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
+| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
+| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
+| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index cc20d6fd2..9bdc6324f 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -30,15 +30,17 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
> config = {
> "labels": [],
+> "threshold": 0.5,
> "model": DEFAULT_TEXTCAT_MODEL,
> }
> nlp.add_pipe("textcat", config=config)
> ```
-| Setting | Description |
-| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
-| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
+| Setting | Description |
+| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/textcat.py
@@ -58,7 +60,7 @@ architectures and their arguments and hyperparameters.
>
> # Construction from class
> from spacy.pipeline import TextCategorizer
-> textcat = TextCategorizer(nlp.vocab, model)
+> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5)
> ```
Create a new pipeline instance. In your application, you would normally use a
@@ -72,6 +74,7 @@ shortcut for this and instantiate the component using its string name and
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `labels` | The labels to use. ~~Iterable[str]~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
diff --git a/website/docs/images/spacy-ray.svg b/website/docs/images/spacy-ray.svg
new file mode 100644
index 000000000..4c2fd81f1
--- /dev/null
+++ b/website/docs/images/spacy-ray.svg
@@ -0,0 +1,55 @@
+
diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 0c04dd8d5..33163f306 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -26,7 +26,7 @@ on training Stanza on this corpus to allow direct comparison.
-| System | POS | USA | LAS |
+| System | POS | UAS | LAS |
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
| spaCy RoBERTa (2020) | | | |
| spaCy CNN (2020) | | | |
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index 36f86dd51..bff31d0d6 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -61,17 +61,13 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
-
-
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 81ddf40fb..b5c3a5356 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -796,11 +796,9 @@ workflows, including
evaluation workflow that lets you compare two different models and their
results.
-
+
-
-
+ -->
---
@@ -817,7 +815,7 @@ full embedded visualizer, as well as individual components.
> #### Installation
>
> ```bash
-> $ pip install "spacy_streamlit>=1.0.0a0"
+> $ pip install "spacy-streamlit>=1.0.0a0"
> ```

@@ -915,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
-
+> #### Installation
+>
+> ```cli
+> $ pip install spacy-ray
+> # Check that the CLI is registered
+> $ python -m spacy ray --help
+> ```
+
+[Ray](https://ray.io/) is a fast and simple framework for building and running
+**distributed applications**. You can use Ray for parallel and distributed
+training with spaCy via our lightweight
+[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
+package is installed in the same environment as spaCy, it will automatically add
+[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
+
+You can integrate [`spacy ray train`](/api/cli#ray-train) into your
+`project.yml` just like the regular training command:
+
+
+```yaml
+### project.yml
+- name: "ray"
+ help: "Train a model via parallel training with Ray"
+ script:
+ - "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
+ deps:
+ - "corpus/train.spacy"
+ - "corpus/dev.spacy"
+```
+
+
---
@@ -943,12 +973,14 @@ your results.

+
-
+
+-->
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 2d6159f3d..7e979b32e 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -1075,7 +1075,7 @@ relations and tokens we want to match:
> #### Visualizing the parse
>
-> The [`displacy` visualizer](/usage/visualizer) lets you render `Doc` objects
+> The [`displacy` visualizer](/usage/visualizers) lets you render `Doc` objects
> and their dependency parse and part-of-speech tags:
>
> ```python
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 4b25d1c21..76e2bdeca 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -7,7 +7,7 @@ menu:
- ['Quickstart', 'quickstart']
- ['Config System', 'config']
- ['Custom Functions', 'custom-functions']
- # - ['Parallel Training', 'parallel-training']
+ - ['Parallel Training', 'parallel-training']
- ['Internal API', 'api']
---
@@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
+## Parallel & distributed training with Ray {#parallel-training}
+
+> #### Installation
+>
+> ```cli
+> $ pip install spacy-ray
+> # Check that the CLI is registered
+> $ python -m spacy ray --help
+> ```
+
+[Ray](https://ray.io/) is a fast and simple framework for building and running
+**distributed applications**. You can use Ray to train spaCy on one or more
+remote machines, potentially speeding up your training process. Parallel
+training won't always be faster though – it depends on your batch size, models,
+and hardware.
+
+
+
+To use Ray with spaCy, you need the
+[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
+Installing the package will automatically add the `ray` command to the spaCy
+CLI.
+
+
+
+The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
+[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
+setup. You can optionally set the `--address` option to point to your Ray
+cluster. If it's not set, Ray will run locally.
+
+```cli
+python -m spacy ray train config.cfg --n-workers 2
+```
+
+
+
+### How parallel training works {#parallel-training-details}
+
+Each worker receives a shard of the **data** and builds a copy of the **model
+and optimizer** from the [`config.cfg`](#config). It also has a communication
+channel to **pass gradients and parameters** to the other workers. Additionally,
+each worker is given ownership of a subset of the parameter arrays. Every
+parameter array is owned by exactly one worker, and the workers are given a
+mapping so they know which worker owns which parameter.
+
+
+
+As training proceeds, every worker will be computing gradients for **all** of
+the model parameters. When they compute gradients for parameters they don't own,
+they'll **send them to the worker** that does own that parameter, along with a
+version identifier so that the owner can decide whether to discard the
+gradient. Workers use the gradients they receive and the ones they compute
+locally to update the parameters they own, and then broadcast the updated array
+and a new version ID to the other workers.
+
+This training procedure is **asynchronous** and **non-blocking**. Workers always
+push their gradient increments and parameter updates; they do not have to pull
+them and block on the result, so the transfers can happen in the background,
+overlapped with the actual training work. The workers also do not have to stop
+and wait for each other ("synchronize") at the start of each batch. This is very
+useful for spaCy, because spaCy is often trained on long documents, which means
+**batches can vary in size** significantly. Uneven workloads make synchronous
+gradient descent inefficient, because if one batch is slow, all of the other
+workers are stuck waiting for it to complete before they can continue.
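+
+The ownership scheme is easy to sketch in plain Python. The snippet below is a
+toy illustration, not the actual `spacy-ray` implementation: it assigns each
+parameter to a worker with a stable hash, and shows the version check an owner
+performs before applying an incoming gradient.
+
+```python
+import hashlib
+from typing import Dict, List, Tuple
+
+def assign_owner(param_key: str, n_workers: int) -> int:
+    # Stable across processes, unlike the built-in hash(), so every
+    # worker computes the same owner for a given parameter key
+    digest = hashlib.md5(param_key.encode("utf8")).hexdigest()
+    return int(digest, 16) % n_workers
+
+class ParamOwner:
+    """The parameters a worker owns, keyed by name, each with a version ID."""
+
+    def __init__(self, learn_rate: float = 0.001):
+        self.params: Dict[str, Tuple[int, List[float]]] = {}
+        self.learn_rate = learn_rate
+
+    def receive_gradient(self, key: str, version: int, grad: List[float]) -> None:
+        current_version, weights = self.params[key]
+        if version != current_version:
+            return  # stale gradient, computed against an outdated version
+        updated = [w - self.learn_rate * g for w, g in zip(weights, grad)]
+        # Bump the version; the updated array would then be broadcast to
+        # the other workers (not shown)
+        self.params[key] = (current_version + 1, updated)
+```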
+
## Internal training API {#api}
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 791b641df..171320267 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
- [Training & config system](#features-training)
- [Custom models](#features-custom-models)
- [End-to-end project workflows](#features-projects)
+- [Parallel training with Ray](#features-parallel-training)
- [New built-in components](#features-pipeline-components)
- [New custom component API](#features-components)
- [Dependency matching](#features-dep-matcher)
@@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.
+### Parallel and distributed training with Ray {#features-parallel-training}
+
+> #### Example
+>
+> ```cli
+> $ pip install spacy-ray
+> # Check that the CLI is registered
+> $ python -m spacy ray --help
+> # Train a pipeline
+> $ python -m spacy ray train config.cfg --n-workers 2
+> ```
+
+[Ray](https://ray.io/) is a fast and simple framework for building and running
+**distributed applications**. You can use Ray to train spaCy on one or more
+remote machines, potentially speeding up your training process. The Ray
+integration is powered by a lightweight extension package,
+[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
+the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
+same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
+parallel training.
+
+
+
+
+
+- **Usage:**
+ [Parallel and distributed training](/usage/training#parallel-training),
+ [spaCy Projects integration](/usage/projects#ray)
+- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
+- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)
+
+
+
### New built-in pipeline components {#features-pipeline-components}
spaCy v3.0 includes several new trainable and rule-based components that you can
@@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
+| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
### New and updated documentation {#new-docs}
diff --git a/website/gatsby-config.js b/website/gatsby-config.js
index 78fdc336f..5e3b5b537 100644
--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@@ -26,11 +26,27 @@ const replacements = {
GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
}
+/**
+ * Compute the overall total counts of models and languages
+ */
+function getCounts(langs = []) {
+ return {
+ langs: langs.length,
+ modelLangs: langs.filter(({ models }) => models && !!models.length).length,
+ starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
+ models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
+ starters: langs
+ .map(({ starters }) => (starters ? starters.length : 0))
+ .reduce((a, b) => a + b, 0),
+ }
+}
+
module.exports = {
siteMetadata: {
...site,
sidebars,
...models,
+ counts: getCounts(models.languages),
universe,
nightly: isNightly,
binderBranch,
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 0419a7207..010ff3618 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,16 @@
{
"resources": [
+ {
+ "id": "spacy-ray",
+ "title": "spacy-ray",
+ "slogan": "Parallel and distributed training with spaCy and Ray",
+ "description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
+ "github": "explosion/spacy-ray",
+ "pip": "spacy-ray",
+ "category": ["training"],
+ "author": "Explosion / Anyscale",
+ "thumb": "https://i.imgur.com/7so6ZpS.png"
+ },
{
"id": "spacy-sentence-bert",
"title": "spaCy - sentence-transformers",
@@ -2518,14 +2529,14 @@
"description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
"pip": "cov-bsv",
"code_example": [
- "import cov_bsv",
- "",
- "nlp = cov_bsv.load()",
- "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
- "",
- "print(doc.ents)",
- "print(doc._.cov_classification)",
- "cov_bsv.visualize_doc(doc)"
+ "import cov_bsv",
+ "",
+ "nlp = cov_bsv.load()",
+ "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
+ "",
+ "print(doc.ents)",
+ "print(doc._.cov_classification)",
+ "cov_bsv.visualize_doc(doc)"
],
"category": ["pipeline", "standalone", "biomedical", "scientific"],
"tags": ["clinical", "epidemiology", "covid-19", "surveillance"],
diff --git a/website/src/components/code.js b/website/src/components/code.js
index 5a7828a33..fad1d2b7f 100644
--- a/website/src/components/code.js
+++ b/website/src/components/code.js
@@ -14,6 +14,7 @@ import GitHubCode from './github'
import classes from '../styles/code.module.sass'
const WRAP_THRESHOLD = 30
+const CLI_GROUPS = ['init', 'debug', 'project', 'ray']
export default props => (