diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 360d2439a..649c2b373 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -301,6 +301,7 @@ def ensure_pathy(path): def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"): + git_version = get_git_version() if dest.exists(): msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): @@ -321,24 +322,28 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: - git_version = get_git_version() supports_sparse = git_version >= (2, 22) # This is the "clone, but don't download anything" part. cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " if supports_sparse: cmd += f"--filter=blob:none" # <-- The key bit else: - msg.warn( + err_old = ( f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " - f"that doesn't fully support sparse checkout yet. This means that " - f"more files than necessary may be downloaded temporarily. To " - f"only download the files needed, upgrade to Git v2.22 or above." + f"that doesn't fully support sparse checkout yet." ) - _attempt_run_command(cmd) + err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." + msg.warn( + f"{err_unk if git_version == (0, 0) else err_old} " + f"This means that more files than necessary may be downloaded " + f"temporarily. To only download the files needed, make sure " + f"you're using Git v2.22 or above." + ) + try_run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}" - ret = _attempt_run_command(cmd) + ret = try_run_command(cmd) git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) @@ -351,27 +356,44 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m msg.fail(err, exits=1) if supports_sparse: cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" - _attempt_run_command(cmd) + try_run_command(cmd) # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" - _attempt_run_command(cmd) + try_run_command(cmd) # We need Path(name) to make sure we also support subdirectories shutil.move(str(tmp_dir / Path(subpath)), str(dest)) -def get_git_version() -> Tuple[int, int]: - ret = _attempt_run_command(["git", "--version"]) - # TODO: this seems kinda brittle? - version = ret.stdout[11:].strip().split(".") +def get_git_version( + error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", +) -> Tuple[int, int]: + """Get the version of git and raise an error if calling 'git --version' fails. + + error (str): The error message to show. + RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns + (0, 0) if the version couldn't be determined. 
+ """ + ret = try_run_command(["git", "--version"], error=error) + stdout = ret.stdout.strip() + if not stdout or not stdout.startswith("git version"): + return (0, 0) + version = stdout[11:].strip().split(".") return (int(version[0]), int(version[1])) -def _attempt_run_command(cmd: Union[str, List[str]]): +def try_run_command( + cmd: Union[str, List[str]], error: str = "Could not run command" +) -> subprocess.CompletedProcess: + """Try running a command and raise an error if it fails. + + cmd (Union[str, List[str]]): The command to run. + error (str): The error message. + RETURNS (CompletedProcess): The completed process if the command ran. + """ try: return run_command(cmd, capture=True) except subprocess.CalledProcessError as e: - err = f"Could not run command" - msg.fail(err) + msg.fail(error) print(cmd) sys.exit(1) @@ -387,8 +409,15 @@ def _from_http_to_git(repo: str) -> str: return repo -def string_to_list(value, intify=False): - """Parse a comma-separated string to a list""" +def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]: + """Parse a comma-separated string to a list and account for various + formatting options. Mostly used to handle CLI arguments that take a list of + comma-separated values. + + value (str): The value to parse. + intify (bool): Whether to convert values to ints. + RETURNS (Union[List[str], List[int]]): A list of strings or ints. + """ if not value: return [] if value.startswith("[") and value.endswith("]"): diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 1a250e43e..a4899a458 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -5,7 +5,8 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam from thinc.api import Model, data_validation import typer -from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list +from ._util import Arg, Opt, debug_cli, show_validation_error +from ._util import parse_config_overrides, string_to_list from .. import util diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 9eab7b54d..05bf99ccd 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -277,7 +277,7 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): def ensure_shape(lines): """Ensure that the first line of the data is the vectors shape. - + If it's not, we read in the data and output the shape as the first result, so that the reader doesn't have to deal with the problem. """ diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 828e5f08e..70858123d 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,10 +1,10 @@ -from typing import Optional, Dict, Any -import random +from typing import Optional import numpy import time import re from collections import Counter from pathlib import Path +from thinc.api import Config from thinc.api import use_pytorch_for_gpu_memory, require_gpu from thinc.api import set_dropout_rate, to_categorical, fix_random_seed from thinc.api import CosineDistance, L2Distance @@ -15,11 +15,10 @@ import typer from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code -from ..errors import Errors from ..ml.models.multi_task import build_cloze_multi_task_model from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..tokens import Doc -from ..attrs import ID, HEAD +from ..attrs import ID from .. import util @@ -30,9 +29,8 @@ from .. 
import util def pretrain_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True), - output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), + output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), @@ -60,13 +58,35 @@ def pretrain_cli( DOCS: https://nightly.spacy.io/api/cli#pretrain """ - overrides = parse_config_overrides(ctx.args) + config_overrides = parse_config_overrides(ctx.args) import_code(code_path) + verify_cli_args(config_path, output_dir, resume_path, epoch_resume) + if use_gpu >= 0: + msg.info("Using GPU") + require_gpu(use_gpu) + else: + msg.info("Using CPU") + msg.info(f"Loading config from: {config_path}") + + with show_validation_error(config_path): + config = util.load_config( + config_path, + overrides=config_overrides, + interpolate=True + ) + if not config.get("pretraining"): + # TODO: What's the solution here? How do we handle optional blocks? + msg.fail("The [pretraining] block in your config is empty", exits=1) + if not output_dir.exists(): + output_dir.mkdir() + msg.good(f"Created output directory: {output_dir}") + + config.to_disk(output_dir / "config.cfg") + msg.good("Saved config file in the output directory") + pretrain( - texts_loc, + config, output_dir, - config_path, - config_overrides=overrides, resume_path=resume_path, epoch_resume=epoch_resume, use_gpu=use_gpu, @@ -74,52 +94,22 @@ def pretrain_cli( def pretrain( - texts_loc: Path, + config: Config, output_dir: Path, - config_path: Path, - config_overrides: Dict[str, Any] = {}, resume_path: Optional[Path] = None, epoch_resume: Optional[int] = None, - use_gpu: int = -1, + use_gpu: int=-1 ): - verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume) - if use_gpu >= 0: - msg.info("Using GPU") - require_gpu(use_gpu) - else: - msg.info("Using CPU") - msg.info(f"Loading config from: {config_path}") - with show_validation_error(config_path): - config = util.load_config(config_path, overrides=config_overrides) - nlp, config = util.load_model_from_config(config) - pretrain_config = config["pretraining"] - if not pretrain_config: - # TODO: What's the solution here? How do we handle optional blocks? 
- msg.fail("The [pretraining] block in your config is empty", exits=1) - if not output_dir.exists(): - output_dir.mkdir() - msg.good(f"Created output directory: {output_dir}") - seed = pretrain_config["seed"] - if seed is not None: - fix_random_seed(seed) - if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]: + if config["system"].get("seed") is not None: + fix_random_seed(config["system"]["seed"]) + if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"): use_pytorch_for_gpu_memory() - config.to_disk(output_dir / "config.cfg") - msg.good("Saved config file in the output directory") - if texts_loc != "-": # reading from a file - with msg.loading("Loading input texts..."): - texts = list(srsly.read_jsonl(texts_loc)) - random.shuffle(texts) - else: # reading from stdin - msg.info("Reading input text from stdin...") - texts = srsly.read_jsonl("-") - - tok2vec_path = pretrain_config["tok2vec_model"] - tok2vec = config - for subpath in tok2vec_path.split("."): - tok2vec = tok2vec.get(subpath) - model = create_pretraining_model(nlp, tok2vec, pretrain_config) - optimizer = pretrain_config["optimizer"] + nlp, config = util.load_model_from_config(config) + P_cfg = config["pretraining"] + corpus = P_cfg["corpus"] + batcher = P_cfg["batcher"] + model = create_pretraining_model(nlp, config["pretraining"]) + optimizer = config["pretraining"]["optimizer"] # Load in pretrained weights to resume from if resume_path is not None: @@ -147,38 +137,35 @@ def pretrain( with (output_dir / "log.jsonl").open("a") as file_: file_.write(srsly.json_dumps(log) + "\n") - skip_counter = 0 - objective = create_objective(pretrain_config["objective"]) - for epoch in range(epoch_resume, pretrain_config["max_epochs"]): - batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"]) - for batch_id, batch in enumerate(batches): - docs, count = make_docs( - nlp, - batch, - max_length=pretrain_config["max_length"], - min_length=pretrain_config["min_length"], - ) - skip_counter += count + objective = create_objective(P_cfg["objective"]) + # TODO: I think we probably want this to look more like the + # 'create_train_batches' function? 
+ for epoch in range(epoch_resume, P_cfg["max_epochs"]): + for batch_id, batch in enumerate(batcher(corpus(nlp))): + docs = ensure_docs(batch) loss = make_update(model, docs, optimizer, objective) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) - if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: - break - if pretrain_config["n_save_every"] and ( - batch_id % pretrain_config["n_save_every"] == 0 + if P_cfg["n_save_every"] and ( + batch_id % P_cfg["n_save_every"] == 0 ): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 - if texts_loc != "-": - # Reshuffle the texts if texts were loaded from a file - random.shuffle(texts) - if skip_counter > 0: - msg.warn(f"Skipped {skip_counter} empty values") msg.good("Successfully finished pretrain") +def ensure_docs(examples_or_docs): + docs = [] + for eg_or_doc in examples_or_docs: + if isinstance(eg_or_doc, Doc): + docs.append(eg_or_doc) + else: + docs.append(eg_or_doc.reference) + return docs + + def _resume_model(model, resume_path, epoch_resume): msg.info(f"Resume training tok2vec from: {resume_path}") with resume_path.open("rb") as file_: @@ -211,36 +198,6 @@ def make_update(model, docs, optimizer, objective_func): return float(loss) -def make_docs(nlp, batch, min_length, max_length): - docs = [] - skip_count = 0 - for record in batch: - if not isinstance(record, dict): - raise TypeError(Errors.E137.format(type=type(record), line=record)) - if "tokens" in record: - words = record["tokens"] - if not words: - skip_count += 1 - continue - doc = Doc(nlp.vocab, words=words) - elif "text" in record: - text = record["text"] - if not text: - skip_count += 1 - continue - doc = nlp.make_doc(text) - else: - raise ValueError(Errors.E138.format(text=record)) - if "heads" in record: - heads = record["heads"] - heads = numpy.asarray(heads, dtype="uint64") - heads = heads.reshape((len(doc), 1)) - doc = doc.from_array([HEAD], heads) - if min_length <= len(doc) < max_length: - docs.append(doc) - return docs, skip_count - - def create_objective(config): """Create the objective for pretraining. @@ -296,7 +253,7 @@ def get_characters_loss(ops, docs, prediction, nr_char): return loss, d_target -def create_pretraining_model(nlp, tok2vec, pretrain_config): +def create_pretraining_model(nlp, pretrain_config): """Define a network for the pretraining. We simply add an output layer onto the tok2vec input model. The tok2vec input model needs to be a model that takes a batch of Doc objects (as a list), and returns a list of arrays. @@ -304,6 +261,12 @@ def create_pretraining_model(nlp, tok2vec, pretrain_config): The actual tok2vec layer is stored as a reference, and only this bit will be serialized to file and read back in when calling the 'train' command. 
""" + component = nlp.get_pipe(pretrain_config["component"]) + if pretrain_config.get("layer"): + tok2vec = component.model.get_ref(pretrain_config["layer"]) + else: + tok2vec = component.model + # TODO maxout_pieces = 3 hidden_size = 300 @@ -372,7 +335,7 @@ def _smart_round(figure, width=10, max_decimal=4): return format_str % figure -def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume): +def verify_cli_args(config_path, output_dir, resume_path, epoch_resume): if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) if output_dir.exists() and [p for p in output_dir.iterdir()]: @@ -388,16 +351,6 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum "It is better to use an empty directory or refer to a new output path, " "then the new directory will be created for you.", ) - if texts_loc != "-": # reading from a file - texts_loc = Path(texts_loc) - if not texts_loc.exists(): - msg.fail("Input text file doesn't exist", texts_loc, exits=1) - - for text in srsly.read_jsonl(texts_loc): - break - else: - msg.fail("Input file is empty", texts_loc, exits=1) - if resume_path is not None: model_name = re.search(r"model\d+\.bin", str(resume_path)) if not model_name and not epoch_resume: diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 7326b2e5c..a8b607e05 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -7,7 +7,7 @@ import requests from ...util import ensure_path, working_dir from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum -from .._util import download_file, git_sparse_checkout +from .._util import download_file, git_sparse_checkout, get_git_version @project_cli.command("assets") @@ -41,6 +41,11 @@ def project_assets(project_dir: Path) -> None: dest = (project_dir / asset["dest"]).resolve() checksum = asset.get("checksum") if "git" in asset: + git_err = ( + f"Cloning spaCy project templates requires Git and the 'git' command. " + f"Make sure it's installed and that the executable is available." + ) + get_git_version(error=git_err) if dest.exists(): # If there's already a file, check for checksum if checksum and checksum == get_checksum(dest): diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index ab617e4ba..f691e523c 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -7,7 +7,7 @@ import re from ... import about from ...util import ensure_path from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE -from .._util import git_sparse_checkout +from .._util import git_sparse_checkout, get_git_version @project_cli.command("clone") @@ -70,16 +70,12 @@ def check_clone(name: str, dest: Path, repo: str) -> None: dest (Path): Local destination of cloned directory. repo (str): URL of the repo to clone from. """ - try: - subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - f"Cloning spaCy project templates requires Git and the 'git' command. ", - f"To clone a project without Git, copy the files from the '{name}' " - f"directory in the {repo} to {dest} manually and then run:", - f"{COMMAND} project init {dest}", - exits=1, - ) + git_err = ( + f"Cloning spaCy project templates requires Git and the 'git' command. 
", + f"To clone a project without Git, copy the files from the '{name}' " + f"directory in the {repo} to {dest} manually.", + ) + get_git_version(error=git_err) if not dest: msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) if dest.exists(): diff --git a/spacy/errors.py b/spacy/errors.py index 7164598b6..8f95609a6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -671,6 +671,9 @@ class Errors: E1007 = ("Unsupported DependencyMatcher operator '{op}'.") E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check " "that you are providing a list of patterns as `List[List[dict]]`.") + E1009 = ("String for hash '{val}' not found in StringStore. Set the value " + "through token.morph_ instead or add the string to the " + "StringStore with `nlp.vocab.strings.add(string)`.") @add_codes diff --git a/spacy/language.py b/spacy/language.py index 70dad59f3..905cdca36 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -244,7 +244,8 @@ class Language: self._config["nlp"]["disabled"] = list(self.disabled) self._config["components"] = pipeline if not self._config["training"].get("score_weights"): - self._config["training"]["score_weights"] = combine_score_weights(score_weights) + combined_score_weights = combine_score_weights(score_weights) + self._config["training"]["score_weights"] = combined_score_weights if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) return self._config @@ -1166,14 +1167,20 @@ class Language: if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="Language", obj=type(get_examples)) raise ValueError(err) + valid_examples = False for example in get_examples(): if not isinstance(example, Example): err = Errors.E978.format( name="Language.begin_training", types=type(example) ) raise ValueError(err) + else: + valid_examples = True for word in [t.text for t in example.reference]: _ = self.vocab[word] # noqa: F841 + if not valid_examples: + err = Errors.E930.format(name="Language", obj="empty list") + raise ValueError(err) if device >= 0: # TODO: do we need this here? require_gpu(device) if self.vocab.vectors.data.shape[1] >= 1: @@ -1274,7 +1281,7 @@ class Language: util.logger.debug(doc) eg.predicted = doc results = scorer.score(examples) - n_words = sum(len(eg.predicted) for eg in examples) + n_words = sum(len(doc) for doc in docs) results["speed"] = n_words / (end_time - start_time) return results diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 4be6f580d..22d1de08f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -56,7 +56,7 @@ subword_features = true @Language.factory( "textcat", assigns=["doc.cats"], - default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL}, + default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL}, scores=[ "cats_score", "cats_score_desc", @@ -75,6 +75,7 @@ def make_textcat( name: str, model: Model[List[Doc], List[Floats2d]], labels: Iterable[str], + threshold: float, ) -> "TextCategorizer": """Create a TextCategorizer compoment. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels can @@ -86,8 +87,9 @@ def make_textcat( scores for each category. labels (list): A list of categories to learn. If empty, the model infers the categories from the data. + threshold (float): Cutoff to consider a prediction "positive". 
""" - return TextCategorizer(nlp.vocab, model, name, labels=labels) + return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold) class TextCategorizer(Pipe): @@ -103,6 +105,7 @@ class TextCategorizer(Pipe): name: str = "textcat", *, labels: Iterable[str], + threshold: float, ) -> None: """Initialize a text categorizer. @@ -111,6 +114,7 @@ class TextCategorizer(Pipe): name (str): The component instance name, used to add entries to the losses during training. labels (Iterable[str]): The labels to use. + threshold (float): Cutoff to consider a prediction "positive". DOCS: https://nightly.spacy.io/api/textcategorizer#init """ @@ -118,7 +122,7 @@ class TextCategorizer(Pipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": labels} + cfg = {"labels": labels, "threshold": threshold} self.cfg = dict(cfg) @property @@ -371,5 +375,6 @@ class TextCategorizer(Pipe): labels=self.labels, multi_label=self.model.attrs["multi_label"], positive_label=positive_label, + threshold=self.cfg["threshold"], **kwargs, ) diff --git a/spacy/schemas.py b/spacy/schemas.py index 38f47c668..0dd2b9204 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -246,15 +246,14 @@ class ConfigSchemaPretrainEmpty(BaseModel): class ConfigSchemaPretrain(BaseModel): # fmt: off max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") - min_length: StrictInt = Field(..., title="Minimum length of examples") - max_length: StrictInt = Field(..., title="Maximum length of examples") dropout: StrictFloat = Field(..., title="Dropout rate") n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency") - batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule") - seed: Optional[StrictInt] = Field(..., title="Random seed") - use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch") - tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. components.tok2vec.model") optimizer: Optimizer = Field(..., title="The optimizer to use") + corpus: Reader = Field(..., title="Reader for the training data") + batcher: Batcher = Field(..., title="Batcher for the training data") + component: str = Field(..., title="Component to find the layer to pretrain") + layer: str = Field(..., title="Layer to pretrain. Whole model if empty.") + # TODO: use a more detailed schema for this? 
objective: Dict[str, Any] = Field(..., title="Pretraining objective") # fmt: on diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 751bd36d4..0c2a2a40b 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -9,7 +9,10 @@ from spacy.pipeline.ner import DEFAULT_NER_MODEL def _ner_example(ner): - doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"]) + doc = Doc( + ner.vocab, + words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"], + ) gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]} return Example.from_dict(doc, gold) diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 6bfc198fd..f378ce042 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -66,3 +66,31 @@ def test_morph_set(i_has): def test_morph_str(i_has): assert str(i_has[0].morph) == "PronType=prs" assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin" + + +def test_morph_property(tokenizer): + doc = tokenizer("a dog") + + # set through token.morph_ + doc[0].morph_ = "PronType=prs" + assert doc[0].morph_ == "PronType=prs" + assert doc.to_array(["MORPH"])[0] != 0 + + # unset with token.morph + doc[0].morph = 0 + assert doc.to_array(["MORPH"])[0] == 0 + + # empty morph is equivalent to "_" + doc[0].morph_ = "" + assert doc[0].morph_ == "" + assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] + + # "_" morph is also equivalent to empty morph + doc[0].morph_ = "_" + assert doc[0].morph_ == "" + assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] + + # set through existing hash with token.morph + tokenizer.vocab.strings.add("Feat=Val") + doc[0].morph = tokenizer.vocab.strings.add("Feat=Val") + assert doc[0].morph_ == "Feat=Val" diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 72005cc82..6361a10ce 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -78,7 +78,7 @@ def patterns(en_vocab): "REL_OP": ">", "RIGHT_ID": "fox", "RIGHT_ATTRS": {"ORTH": "fox"}, - } + }, ] pattern5 = [ @@ -233,9 +233,7 @@ def test_dependency_matcher_callback(en_vocab, doc): assert matches == matches2 -@pytest.mark.parametrize( - "op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20),] -) +@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)]) def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): # two sentences to test that all matches are within the same sentence doc = get_doc( @@ -248,7 +246,7 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): for text in ["a", "b", "c", "d", "e"]: pattern = [ {"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}}, - {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {},}, + {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}}, ] matcher = DependencyMatcher(en_vocab) matcher.add("A", [pattern]) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 0da42daa2..3d67e6ef6 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -54,7 +54,10 @@ def _parser_example(parser): def _ner_example(ner): - doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"]) + doc = Doc( + ner.vocab, + words=["Joe", "loves", 
"visiting", "London", "during", "the", "weekend"], + ) gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]} return Example.from_dict(doc, gold) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 1752df5d0..5827f8ff1 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -30,9 +30,10 @@ TRAIN_DATA = [ ), ] + def test_begin_training_examples(): nlp = Language() - senter = nlp.add_pipe("senter") + nlp.add_pipe("senter") train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 3f9506bb1..d12a7211a 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -89,7 +89,7 @@ def test_no_label(): def test_implicit_label(): nlp = Language() - textcat = nlp.add_pipe("textcat") + nlp.add_pipe("textcat") train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index db62f6569..e621aebd8 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_TEXTCAT_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"]) + textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5) textcat.to_bytes(exclude=["vocab"]) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 0df707dc0..586f79afc 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -5,7 +5,6 @@ from spacy.training import docs_to_json, biluo_tags_from_offsets from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.lang.en import English from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate -from spacy.cli.pretrain import make_docs from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables @@ -231,48 +230,6 @@ def test_cli_converters_conll_ner2json(): assert ent.text in ["New York City", "London"] -def test_pretrain_make_docs(): - nlp = English() - - valid_jsonl_text = {"text": "Some text"} - docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10) - assert len(docs) == 1 - assert skip_count == 0 - - valid_jsonl_tokens = {"tokens": ["Some", "tokens"]} - docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10) - assert len(docs) == 1 - assert skip_count == 0 - - invalid_jsonl_type = 0 - with pytest.raises(TypeError): - make_docs(nlp, [invalid_jsonl_type], 1, 100) - - invalid_jsonl_key = {"invalid": "Does not matter"} - with pytest.raises(ValueError): - make_docs(nlp, [invalid_jsonl_key], 1, 100) - - empty_jsonl_text = {"text": ""} - docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10) - assert len(docs) == 0 - assert skip_count == 1 - - empty_jsonl_tokens = {"tokens": []} - docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10) - assert len(docs) == 0 - assert skip_count == 1 - - too_short_jsonl = {"text": "This text is not long enough"} - docs, skip_count = 
make_docs(nlp, [too_short_jsonl], 10, 15) - assert len(docs) == 0 - assert skip_count == 0 - - too_long_jsonl = {"text": "This text contains way too much tokens for this test"} - docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5) - assert len(docs) == 0 - assert skip_count == 0 - - def test_project_config_validation_full(): config = { "vars": {"some_var": 20}, diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index ff31ae8a9..23c2d5c47 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer): tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) doc = tokenizer(text) assert [token.text for token in doc] == ["_SPECIAL_", "."] + + +def test_tokenizer_special_cases_idx(tokenizer): + text = "the _ID'X_" + tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}]) + doc = tokenizer(text) + assert doc[1].idx == 4 + assert doc[2].idx == 7 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 787cca652..17714940d 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -343,8 +343,9 @@ cdef class Tokenizer: for j in range(cached.length): tokens[i + offset + j] = cached.data.tokens[j] tokens[i + offset + j].idx = orig_idx + idx_offset - idx_offset += cached.data.tokens[j].lex.length + \ - 1 if cached.data.tokens[j].spacy else 0 + idx_offset += cached.data.tokens[j].lex.length + if cached.data.tokens[j].spacy: + idx_offset += 1 tokens[i + offset + cached.length - 1].spacy = orig_final_spacy i += span_end - span_start offset += span[3] diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 50f1c5da3..2474f0637 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -214,9 +214,17 @@ cdef class Token: xp = get_array_module(vector) return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - @property - def morph(self): - return MorphAnalysis.from_id(self.vocab, self.c.morph) + property morph: + def __get__(self): + return MorphAnalysis.from_id(self.vocab, self.c.morph) + + def __set__(self, attr_t morph): + if morph == 0: + self.c.morph = morph + elif morph in self.vocab.strings: + self.morph_ = self.vocab.strings[morph] + else: + raise ValueError(Errors.E1009.format(val=morph)) property morph_: def __get__(self): diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 545f01eaa..20e4507aa 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -1,6 +1,7 @@ import warnings from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable from pathlib import Path +import srsly from .. 
import util from .example import Example @@ -21,6 +22,36 @@ def create_docbin_reader( ) -> Callable[["Language"], Iterable[Example]]: return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit) +@util.registry.readers("spacy.JsonlReader.v1") +def create_jsonl_reader( + path: Path, min_length: int=0, max_length: int = 0, limit: int = 0 +) -> Callable[["Language"], Iterable[Doc]]: + return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit) + + +def walk_corpus(path: Union[str, Path], file_type) -> List[Path]: + path = util.ensure_path(path) + if not path.is_dir() and path.parts[-1].endswith(file_type): + return [path] + orig_path = path + paths = [path] + locs = [] + seen = set() + for path in paths: + if str(path) in seen: + continue + seen.add(str(path)) + if path.parts and path.parts[-1].startswith("."): + continue + elif path.is_dir(): + paths.extend(path.iterdir()) + elif path.parts[-1].endswith(file_type): + locs.append(path) + if len(locs) == 0: + warnings.warn(Warnings.W090.format(path=orig_path)) + return locs + + class Corpus: """Iterate Example objects from a file or directory of DocBin (.spacy) @@ -47,36 +78,13 @@ class Corpus: *, limit: int = 0, gold_preproc: bool = False, - max_length: bool = False, + max_length: int = 0, ) -> None: self.path = util.ensure_path(path) self.gold_preproc = gold_preproc self.max_length = max_length self.limit = limit - @staticmethod - def walk_corpus(path: Union[str, Path]) -> List[Path]: - path = util.ensure_path(path) - if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE): - return [path] - orig_path = path - paths = [path] - locs = [] - seen = set() - for path in paths: - if str(path) in seen: - continue - seen.add(str(path)) - if path.parts and path.parts[-1].startswith("."): - continue - elif path.is_dir(): - paths.extend(path.iterdir()) - elif path.parts[-1].endswith(FILE_TYPE): - locs.append(path) - if len(locs) == 0: - warnings.warn(Warnings.W090.format(path=orig_path)) - return locs - def __call__(self, nlp: "Language") -> Iterator[Example]: """Yield examples from the data. 
@@ -85,11 +93,11 @@ class Corpus: DOCS: https://nightly.spacy.io/api/corpus#call """ - ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path)) + ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE)) if self.gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) else: - examples = self.make_examples(nlp, ref_docs, self.max_length) + examples = self.make_examples(nlp, ref_docs) yield from examples def _make_example( @@ -108,18 +116,18 @@ class Corpus: return Example(nlp.make_doc(reference.text), reference) def make_examples( - self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0 + self, nlp: "Language", reference_docs: Iterable[Doc] ) -> Iterator[Example]: for reference in reference_docs: if len(reference) == 0: continue - elif max_length == 0 or len(reference) < max_length: + elif self.max_length == 0 or len(reference) < self.max_length: yield self._make_example(nlp, reference, False) elif reference.is_sentenced: for ref_sent in reference.sents: if len(ref_sent) == 0: continue - elif max_length == 0 or len(ref_sent) < max_length: + elif self.max_length == 0 or len(ref_sent) < self.max_length: yield self._make_example(nlp, ref_sent.as_doc(), False) def make_examples_gold_preproc( @@ -151,3 +159,57 @@ class Corpus: i += 1 if self.limit >= 1 and i >= self.limit: break + + +class JsonlTexts: + """Iterate Doc objects from a file or directory of jsonl + formatted raw text files. + + path (Path): The directory or filename to read from. + min_length (int): Minimum document length (in tokens). Shorter documents + will be skipped. Defaults to 0, which indicates no limit. + + max_length (int): Maximum document length (in tokens). Longer documents will + be skipped. Defaults to 0, which indicates no limit. + limit (int): Limit corpus to a subset of examples, e.g. for debugging. + Defaults to 0, which indicates no limit. + + DOCS: https://nightly.spacy.io/api/corpus + """ + file_type = "jsonl" + + def __init__( + self, + path: Union[str, Path], + *, + limit: int = 0, + min_length: int = 0, + max_length: int = 0, + ) -> None: + self.path = util.ensure_path(path) + self.min_length = min_length + self.max_length = max_length + self.limit = limit + + def __call__(self, nlp: "Language") -> Iterator[Example]: + """Yield examples from the data. + + nlp (Language): The current nlp object. + YIELDS (Doc): The docs. + + DOCS: https://nightly.spacy.io/api/corpus#call + """ + for loc in walk_corpus(self.path, "jsonl"): + records = srsly.read_jsonl(loc) + for record in records: + doc = nlp.make_doc(record["text"]) + if self.min_length >= 1 and len(doc) < self.min_length: + continue + elif self.max_length >= 1 and len(doc) >= self.max_length: + continue + else: + words = [w.text for w in doc] + spaces = [bool(w.whitespace_) for w in doc] + # We don't *need* an example here, but it seems nice to + # make it match the Corpus signature. 
+ yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces)) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index e071e5827..66fe25ed6 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -36,20 +36,12 @@ def console_logger(): keys=list(info["losses"].keys()), ) ) from None - - try: - scores = [ - "{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100) - for col in score_cols - ] - except KeyError as e: - raise KeyError( - Errors.E983.format( - dict="scores (other)", - key=str(e), - keys=list(info["other_scores"].keys()), - ) - ) from None + scores = [] + for col in score_cols: + score = float(info["other_scores"].get(col, 0.0)) + if col != "speed": + score *= 100 + scores.append("{0:.2f}".format(score)) data = ( [info["epoch"], info["step"]] + losses diff --git a/spacy/util.py b/spacy/util.py index d8df04554..d1df1f92a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -648,12 +648,20 @@ def join_command(command: List[str]) -> str: return " ".join(shlex.quote(cmd) for cmd in command) -def run_command(command: Union[str, List[str]], *, capture=False, stdin=None): +def run_command( + command: Union[str, List[str]], + *, + capture: bool = False, + stdin: Optional[Any] = None, +) -> Optional[subprocess.CompletedProcess]: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. command (str / List[str]): The command. If provided as a string, the string will be split using shlex.split. + stdin (Optional[Any]): stdin to read from or None. + capture (bool): Whether to capture the output. + RETURNS (Optional[CompletedProcess]): The process object. """ if isinstance(command, str): command = split_command(command) @@ -671,6 +679,10 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None): raise FileNotFoundError( Errors.E970.format(str_command=" ".join(command), tool=command[0]) ) from None + except subprocess.CalledProcessError as e: + # We don't want a duplicate traceback here + print(e) + sys.exit(1) if ret.returncode != 0: sys.exit(ret.returncode) return ret diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 55e552e72..c27efb2e4 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -14,6 +14,7 @@ menu: - ['evaluate', 'evaluate'] - ['package', 'package'] - ['project', 'project'] + - ['ray', 'ray'] --- spaCy's CLI provides a range of helpful commands for downloading and training @@ -1134,3 +1135,47 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] | `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. | + +## ray {#ray new="3"} + +The `spacy ray` CLI includes commands for parallel and distributed computing via +[Ray](https://ray.io). + + + +To use this command, you need the +[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed. +Installing the package will automatically add the `ray` command to the spaCy +CLI. + + + +### ray train {#ray-train tag="command"} + +Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The +command works just like [`spacy train`](/api/cli#train). For more details and +examples, see the usage guide on +[parallel training](/usage/training#parallel-training) and the spaCy project +[integration](/usage/projects#ray). 
+ +```cli +$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides] +``` + +> #### Example +> +> ```cli +> $ python -m spacy ray train config.cfg --n-workers 2 +> ``` + +| Name | Description | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ | +| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ | +| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ | +| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | +| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index cc20d6fd2..9bdc6324f 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -30,15 +30,17 @@ architectures and their arguments and hyperparameters. > from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL > config = { > "labels": [], +> "threshold": 0.5, > "model": DEFAULT_TEXTCAT_MODEL, > } > nlp.add_pipe("textcat", config=config) > ``` -| Setting | Description | -| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ | -| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/textcat.py @@ -58,7 +60,7 @@ architectures and their arguments and hyperparameters. 
> > # Construction from class > from spacy.pipeline import TextCategorizer -> textcat = TextCategorizer(nlp.vocab, model) +> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5) > ``` Create a new pipeline instance. In your application, you would normally use a @@ -72,6 +74,7 @@ shortcut for this and instantiate the component using its string name and | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | _keyword-only_ | | | `labels` | The labels to use. ~~Iterable[str]~~ | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | ## TextCategorizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/images/spacy-ray.svg b/website/docs/images/spacy-ray.svg new file mode 100644 index 000000000..4c2fd81f1 --- /dev/null +++ b/website/docs/images/spacy-ray.svg @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 0c04dd8d5..33163f306 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -26,7 +26,7 @@ on training Stanza on this corpus to allow direct comparison.
-| System | POS | USA | LAS | +| System | POS | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | ---: | | spaCy RoBERTa (2020) | | | | | spaCy CNN (2020) | | | | diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index 36f86dd51..bff31d0d6 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -61,17 +61,13 @@ import Benchmarks from 'usage/\_benchmarks-models.md' - - diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 81ddf40fb..b5c3a5356 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -796,11 +796,9 @@ workflows, including evaluation workflow that lets you compare two different models and their results. - + - - + --> --- @@ -817,7 +815,7 @@ full embedded visualizer, as well as individual components. > #### Installation > > ```bash -> $ pip install "spacy_streamlit>=1.0.0a0" +> $ pip install "spacy-streamlit>=1.0.0a0" > ``` ![](../images/spacy-streamlit.png) @@ -915,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main. - +> #### Installation +> +> ```cli +> $ pip install spacy-ray +> # Check that the CLI is registered +> $ python -m spacy ray --help +> ``` + +[Ray](https://ray.io/) is a fast and simple framework for building and running +**distributed applications**. You can use Ray for parallel and distributed +training with spaCy via our lightweight +[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the +package is installed in the same environment as spaCy, it will automatically add +[`spacy ray`](/api/cli#ray) commands to your spaCy CLI. + +You can integrate [`spacy ray train`](/api/cli#ray-train) into your +`project.yml` just like the regular training command: + + +```yaml +### project.yml +- name: "ray" + help: "Train a model via parallel training with Ray" + script: + - "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy" + deps: + - "corpus/train.spacy" + - "corpus/dev.spacy" +``` + + --- @@ -943,12 +973,14 @@ your results. 
![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values') + - + +--> diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 2d6159f3d..7e979b32e 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1075,7 +1075,7 @@ relations and tokens we want to match: > #### Visualizing the parse > -> The [`displacy` visualizer](/usage/visualizer) lets you render `Doc` objects +> The [`displacy` visualizer](/usage/visualizers) lets you render `Doc` objects > and their dependency parse and part-of-speech tags: > > ```python diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 4b25d1c21..76e2bdeca 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -7,7 +7,7 @@ menu: - ['Quickstart', 'quickstart'] - ['Config System', 'config'] - ['Custom Functions', 'custom-functions'] - # - ['Parallel Training', 'parallel-training'] + - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] --- @@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]: return create_model(output_width) ``` +## Parallel & distributed training with Ray {#parallel-training} + +> #### Installation +> +> ```cli +> $ pip install spacy-ray +> # Check that the CLI is registered +> $ python -m spacy ray --help +> ``` + +[Ray](https://ray.io/) is a fast and simple framework for building and running +**distributed applications**. You can use Ray to train spaCy on one or more +remote machines, potentially speeding up your training process. Parallel +training won't always be faster though – it depends on your batch size, models, +and hardware. + + + +To use Ray with spaCy, you need the +[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed. +Installing the package will automatically add the `ray` command to the spaCy +CLI. + + + +The [`spacy ray train`](/api/cli#ray-train) command follows the same API as +[`spacy train`](/api/cli#train), with a few extra options to configure the Ray +setup. You can optionally set the `--address` option to point to your Ray +cluster. If it's not set, Ray will run locally. + +```cli +python -m spacy ray train config.cfg --n-workers 2 +``` + + + +### How parallel training works {#parallel-training-details} + +Each worker receives a shard of the **data** and builds a copy of the **model +and optimizer** from the [`config.cfg`](#config). It also has a communication +channel to **pass gradients and parameters** to the other workers. Additionally, +each worker is given ownership of a subset of the parameter arrays. Every +parameter array is owned by exactly one worker, and the workers are given a +mapping so they know which worker owns which parameter. + +![Illustration of setup](../images/spacy-ray.svg) + +As training proceeds, every worker will be computing gradients for **all** of +the model parameters. When they compute gradients for parameters they don't own, +they'll **send them to the worker** that does own that parameter, along with a +version identifier so that the owner can decide whether the discard the +gradient. Workers use the gradients they receive and the ones they compute +locally to update the parameters they own, and then broadcast the updated array +and a new version ID to the other workers. + +This training procedure is **asynchronous** and **non-blocking**. 
Workers always +push their gradient increments and parameter updates, they do not have to pull +them and block on the result, so the transfers can happen in the background, +overlapped with the actual training work. The workers also do not have to stop +and wait for each other ("synchronize") at the start of each batch. This is very +useful for spaCy, because spaCy is often trained on long documents, which means +**batches can vary in size** significantly. Uneven workloads make synchronous +gradient descent inefficient, because if one batch is slow, all of the other +workers are stuck waiting for it to complete before they can continue. + ## Internal training API {#api} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 791b641df..171320267 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases. - [Training & config system](#features-training) - [Custom models](#features-custom-models) - [End-to-end project workflows](#features-projects) +- [Parallel training with Ray](#features-parallel-training) - [New built-in components](#features-pipeline-components) - [New custom component API](#features-components) - [Dependency matching](#features-dep-matcher) @@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline. +### Parallel and distributed training with Ray {#features-parallel-training} + +> #### Example +> +> ```cli +> $ pip install spacy-ray +> # Check that the CLI is registered +> $ python -m spacy ray --help +> # Train a pipeline +> $ python -m spacy ray train config.cfg --n-workers 2 +> ``` + +[Ray](https://ray.io/) is a fast and simple framework for building and running +**distributed applications**. You can use Ray to train spaCy on one or more +remote machines, potentially speeding up your training process. The Ray +integration is powered by a lightweight extension package, +[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds +the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the +same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for +parallel training. + +![Illustration of setup](../images/spacy-ray.svg) + + + +- **Usage: ** + [Parallel and distributed training](/usage/training#parallel-training), + [spaCy Projects integration](/usage/projects#ray) +- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train) +- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray) + + + ### New built-in pipeline components {#features-pipeline-components} spaCy v3.0 includes several new trainable and rule-based components that you can @@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0. | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. | | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | +| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. 
| ### New and updated documentation {#new-docs} diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 78fdc336f..5e3b5b537 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -26,11 +26,27 @@ const replacements = { GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, } +/** + * Compute the overall total counts of models and languages + */ +function getCounts(langs = []) { + return { + langs: langs.length, + modelLangs: langs.filter(({ models }) => models && !!models.length).length, + starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length, + models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0), + starters: langs + .map(({ starters }) => (starters ? starters.length : 0)) + .reduce((a, b) => a + b, 0), + } +} + module.exports = { siteMetadata: { ...site, sidebars, ...models, + counts: getCounts(models.languages), universe, nightly: isNightly, binderBranch, diff --git a/website/meta/universe.json b/website/meta/universe.json index 0419a7207..010ff3618 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,16 @@ { "resources": [ + { + "id": "spacy-ray", + "title": "spacy-ray", + "slogan": "Parallel and distributed training with spaCy and Ray", + "description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.", + "github": "explosion/spacy-ray", + "pip": "spacy-ray", + "category": ["training"], + "author": "Explosion / Anyscale", + "thumb": "https://i.imgur.com/7so6ZpS.png" + }, { "id": "spacy-sentence-bert", "title": "spaCy - sentence-transformers", @@ -2518,14 +2529,14 @@ "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.", "pip": "cov-bsv", "code_example": [ - "import cov_bsv", - "", - "nlp = cov_bsv.load()", - "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'", - "", - "print(doc.ents)", - "print(doc._.cov_classification)", - "cov_bsv.visualize_doc(doc)" + "import cov_bsv", + "", + "nlp = cov_bsv.load()", + "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'", + "", + "print(doc.ents)", + "print(doc._.cov_classification)", + "cov_bsv.visualize_doc(doc)" ], "category": ["pipeline", "standalone", "biomedical", "scientific"], "tags": ["clinical", "epidemiology", "covid-19", "surveillance"], diff --git a/website/src/components/code.js b/website/src/components/code.js index 5a7828a33..fad1d2b7f 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -14,6 +14,7 @@ import GitHubCode from './github' import classes from '../styles/code.module.sass' const WRAP_THRESHOLD = 30 +const CLI_GROUPS = ['init', 'debug', 'project', 'ray'] export default props => (
@@ -99,7 +100,6 @@ function replacePrompt(line, prompt, isFirst = false) {
 }
 
 function parseArgs(raw) {
-    const commandGroups = ['init', 'debug', 'project']
     let args = raw.split(' ').filter(arg => arg)
     const result = {}
     while (args.length) {
@@ -108,7 +108,12 @@ function parseArgs(raw) {
             const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
             result[opt] = isFlag ? true : args.shift()
         } else {
-            const key = commandGroups.includes(opt) ? `${opt} ${args.shift()}` : opt
+            let key = opt
+            if (CLI_GROUPS.includes(opt)) {
+                if (args.length && !args[0].startsWith('-')) {
+                    key = `${opt} ${args.shift()}`
+                }
+            }
             result[key] = null
         }
     }
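
The hunk above only shows part of `parseArgs`, so here is a small, self-contained JavaScript sketch of the behaviour the change is aiming for. The outer option-parsing loop is reconstructed from context, and the `parseArgsSketch` helper and the example commands are purely illustrative, not code from the repo. The point of the new guard is that a group command like `ray` is only merged with the following token when that token is a subcommand rather than a flag, so `spacy ray --help` keeps working once `ray` is added to the command groups.

```js
// Illustrative sketch only: a standalone re-implementation of the option
// parsing shown above. The outer loop (shifting `opt` off the argument list)
// is not visible in the hunk and is reconstructed here from context.
const CLI_GROUPS = ['init', 'debug', 'project', 'ray']

function parseArgsSketch(raw) {
    let args = raw.split(' ').filter(arg => arg)
    const result = {}
    while (args.length) {
        const opt = args.shift()
        if (opt.length > 1 && opt.startsWith('-')) {
            // option: consume a value unless the next token is another flag
            const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
            result[opt] = isFlag ? true : args.shift()
        } else {
            // positional: only merge a group command with the next token if
            // that token is a subcommand, not a flag
            let key = opt
            if (CLI_GROUPS.includes(opt) && args.length && !args[0].startsWith('-')) {
                key = `${opt} ${args.shift()}`
            }
            result[key] = null
        }
    }
    return result
}

console.log(parseArgsSketch('ray train config.cfg --n-workers 2'))
// { 'ray train': null, 'config.cfg': null, '--n-workers': '2' }
console.log(parseArgsSketch('ray --help'))
// { ray: null, '--help': true }  (an unconditional args.shift() would have
// merged the flag into the key as 'ray --help')
```
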
diff --git a/website/src/components/landing.js b/website/src/components/landing.js
index 64764ad2a..96f9640ce 100644
--- a/website/src/components/landing.js
+++ b/website/src/components/landing.js
@@ -38,8 +38,8 @@ export const LandingSubtitle = ({ children }) => (
 )
 
 export const LandingGrid = ({ cols = 3, blocks = false, style, children }) => (
-    
-        
+    
+        
             {children}
         
     
diff --git a/website/src/styles/accordion.module.sass b/website/src/styles/accordion.module.sass
index a0b736838..8e67bfc29 100644
--- a/website/src/styles/accordion.module.sass
+++ b/website/src/styles/accordion.module.sass
@@ -26,8 +26,11 @@
         border-bottom-right-radius: 0
 
 .icon
-    width: 2rem
-    height: 2rem
+    $width: 2rem
+
+    width: $width
+    height: $width
+    flex: 0 0 $width
     background: var(--color-theme)
     color: var(--color-back)
     border-radius: 50%
diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass
index 134016b0d..9629004b4 100644
--- a/website/src/styles/landing.module.sass
+++ b/website/src/styles/landing.module.sass
@@ -128,14 +128,17 @@
         padding-right: 2rem
 
 @include breakpoint(max, md)
+    .banner
+        padding: 1rem 3rem
+
     .banner-content
         display: block
 
     .banner-text
         padding-top: 0
 
-    .col
-        grid-column: 1 / span 2
+    .grid
+        grid-template-columns: 1fr !important
 
 .banner-button
     margin-bottom: var(--spacing-sm)
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index d545fdb96..77fcdfd81 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -54,23 +54,8 @@ for entity in doc.ents:
     print(entity.text, entity.label_)
 `
 
-/**
- * Compute the overall total counts of models and languages
- */
-function getCounts(langs = []) {
-    return {
-        langs: langs.length,
-        modelLangs: langs.filter(({ models }) => models && !!models.length).length,
-        starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
-        models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
-        starters: langs
-            .map(({ starters }) => (starters ? starters.length : 0))
-            .reduce((a, b) => a + b, 0),
-    }
-}
-
 const Landing = ({ data }) => {
-    const counts = getCounts(data.languages)
+    const { counts } = data
     return (
         <>
             
@@ -345,7 +330,10 @@ const landingQuery = graphql`
             siteMetadata {
                 nightly
                 repo
-                languages {
+                counts {
+                    langs
+                    modelLangs
+                    starterLangs
                     models
                     starters
                 }