Merge branch 'develop' into nightly.spacy.io

Ines Montani · 2020-09-13 22:31:22 +02:00 · commit ceb850f099
40 changed files with 643 additions and 313 deletions


@@ -301,6 +301,7 @@ def ensure_pathy(path):
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
+    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
@@ -321,24 +322,28 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
    # *that* we can do by path.
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
-        git_version = get_git_version()
        supports_sparse = git_version >= (2, 22)
        # This is the "clone, but don't download anything" part.
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
        if supports_sparse:
            cmd += f"--filter=blob:none"  # <-- The key bit
        else:
-            msg.warn(
+            err_old = (
                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-                f"that doesn't fully support sparse checkout yet. This means that "
-                f"more files than necessary may be downloaded temporarily. To "
-                f"only download the files needed, upgrade to Git v2.22 or above."
+                f"that doesn't fully support sparse checkout yet."
            )
-        _attempt_run_command(cmd)
+            err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
+            msg.warn(
+                f"{err_unk if git_version == (0, 0) else err_old} "
+                f"This means that more files than necessary may be downloaded "
+                f"temporarily. To only download the files needed, make sure "
+                f"you're using Git v2.22 or above."
+            )
+        try_run_command(cmd)
        # Now we need to find the missing filenames for the subpath we want.
        # Looking for this 'rev-list' command in the git --help? Hah.
        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
-        ret = _attempt_run_command(cmd)
+        ret = try_run_command(cmd)
        git_repo = _from_http_to_git(repo)
        # Now pass those missings into another bit of git internals
        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
@@ -351,27 +356,44 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
            msg.fail(err, exits=1)
        if supports_sparse:
            cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-            _attempt_run_command(cmd)
+            try_run_command(cmd)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        _attempt_run_command(cmd)
+        try_run_command(cmd)
        # We need Path(name) to make sure we also support subdirectories
        shutil.move(str(tmp_dir / Path(subpath)), str(dest))


-def get_git_version() -> Tuple[int, int]:
-    ret = _attempt_run_command(["git", "--version"])
-    # TODO: this seems kinda brittle?
-    version = ret.stdout[11:].strip().split(".")
+def get_git_version(
+    error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
+) -> Tuple[int, int]:
+    """Get the version of git and raise an error if calling 'git --version' fails.
+    error (str): The error message to show.
+    RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
+        (0, 0) if the version couldn't be determined.
+    """
+    ret = try_run_command(["git", "--version"], error=error)
+    stdout = ret.stdout.strip()
+    if not stdout or not stdout.startswith("git version"):
+        return (0, 0)
+    version = stdout[11:].strip().split(".")
    return (int(version[0]), int(version[1]))


-def _attempt_run_command(cmd: Union[str, List[str]]):
+def try_run_command(
+    cmd: Union[str, List[str]], error: str = "Could not run command"
+) -> subprocess.CompletedProcess:
+    """Try running a command and raise an error if it fails.
+    cmd (Union[str, List[str]]): The command to run.
+    error (str): The error message.
+    RETURNS (CompletedProcess): The completed process if the command ran.
+    """
    try:
        return run_command(cmd, capture=True)
    except subprocess.CalledProcessError as e:
-        err = f"Could not run command"
-        msg.fail(err)
+        msg.fail(error)
        print(cmd)
        sys.exit(1)
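For orientation, a minimal usage sketch of the helpers above. The module path and the repository/subpath values are illustrative assumptions, not part of this diff:

```python
from pathlib import Path
from spacy.cli._util import get_git_version, git_sparse_checkout, try_run_command  # path assumed

# (0, 0) means the Git version could not be parsed; >= (2, 22) enables sparse checkout.
major, minor = get_git_version()
print(f"git {major}.{minor}")

# try_run_command exits with a friendly message instead of a traceback on failure.
ret = try_run_command(["git", "--version"])
print(ret.stdout.strip())

# Check out only a subdirectory of a repo into a destination that doesn't exist yet.
git_sparse_checkout(
    "https://github.com/explosion/projects",  # example repository
    "some/subpath",                           # hypothetical subdirectory
    Path("local_dest"),
    branch="master",
)
```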
@@ -387,8 +409,15 @@ def _from_http_to_git(repo: str) -> str:
    return repo


-def string_to_list(value, intify=False):
-    """Parse a comma-separated string to a list"""
+def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
+    """Parse a comma-separated string to a list and account for various
+    formatting options. Mostly used to handle CLI arguments that take a list of
+    comma-separated values.
+    value (str): The value to parse.
+    intify (bool): Whether to convert values to ints.
+    RETURNS (Union[List[str], List[int]]): A list of strings or ints.
+    """
    if not value:
        return []
    if value.startswith("[") and value.endswith("]"):
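A quick sketch of the intended behavior of `string_to_list`; the expected outputs are inferred from the docstring and surrounding code rather than verified here:

```python
from spacy.cli._util import string_to_list  # module path assumed

print(string_to_list("train,dev,test"))          # expected: ['train', 'dev', 'test']
print(string_to_list("[1, 2, 3]", intify=True))  # expected: [1, 2, 3]
print(string_to_list(""))                        # expected: []
```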


@@ -5,7 +5,8 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation
import typer

-from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
+from ._util import Arg, Opt, debug_cli, show_validation_error
+from ._util import parse_config_overrides, string_to_list
from .. import util


@@ -1,10 +1,10 @@
-from typing import Optional, Dict, Any
-import random
+from typing import Optional
import numpy
import time
import re
from collections import Counter
from pathlib import Path
+from thinc.api import Config
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
@@ -15,11 +15,10 @@ import typer
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
-from ..errors import Errors
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..tokens import Doc
-from ..attrs import ID, HEAD
+from ..attrs import ID
from .. import util
@@ -30,9 +29,8 @@ from .. import util
def pretrain_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
-    texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
-    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
+    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
@@ -60,13 +58,35 @@ def pretrain_cli(
    DOCS: https://nightly.spacy.io/api/cli#pretrain
    """
-    overrides = parse_config_overrides(ctx.args)
+    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
+    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
+    if use_gpu >= 0:
+        msg.info("Using GPU")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
+    msg.info(f"Loading config from: {config_path}")
+    with show_validation_error(config_path):
+        config = util.load_config(
+            config_path,
+            overrides=config_overrides,
+            interpolate=True
+        )
+    if not config.get("pretraining"):
+        # TODO: What's the solution here? How do we handle optional blocks?
+        msg.fail("The [pretraining] block in your config is empty", exits=1)
+    if not output_dir.exists():
+        output_dir.mkdir()
+        msg.good(f"Created output directory: {output_dir}")
+    config.to_disk(output_dir / "config.cfg")
+    msg.good("Saved config file in the output directory")
    pretrain(
-        texts_loc,
+        config,
        output_dir,
-        config_path,
-        config_overrides=overrides,
        resume_path=resume_path,
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
@@ -74,52 +94,22 @@ def pretrain_cli(
def pretrain(
-    texts_loc: Path,
+    config: Config,
    output_dir: Path,
-    config_path: Path,
-    config_overrides: Dict[str, Any] = {},
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
+    use_gpu: int=-1
):
-    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    msg.info(f"Loading config from: {config_path}")
-    with show_validation_error(config_path):
-        config = util.load_config(config_path, overrides=config_overrides)
-    nlp, config = util.load_model_from_config(config)
-    pretrain_config = config["pretraining"]
-    if not pretrain_config:
-        # TODO: What's the solution here? How do we handle optional blocks?
-        msg.fail("The [pretraining] block in your config is empty", exits=1)
-    if not output_dir.exists():
-        output_dir.mkdir()
-        msg.good(f"Created output directory: {output_dir}")
-    seed = pretrain_config["seed"]
-    if seed is not None:
-        fix_random_seed(seed)
-    if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]:
+    if config["system"].get("seed") is not None:
+        fix_random_seed(config["system"]["seed"])
+    if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
        use_pytorch_for_gpu_memory()
-    config.to_disk(output_dir / "config.cfg")
-    msg.good("Saved config file in the output directory")
-    if texts_loc != "-":  # reading from a file
-        with msg.loading("Loading input texts..."):
-            texts = list(srsly.read_jsonl(texts_loc))
-        random.shuffle(texts)
-    else:  # reading from stdin
-        msg.info("Reading input text from stdin...")
-        texts = srsly.read_jsonl("-")
-    tok2vec_path = pretrain_config["tok2vec_model"]
-    tok2vec = config
-    for subpath in tok2vec_path.split("."):
-        tok2vec = tok2vec.get(subpath)
-    model = create_pretraining_model(nlp, tok2vec, pretrain_config)
-    optimizer = pretrain_config["optimizer"]
+    nlp, config = util.load_model_from_config(config)
+    P_cfg = config["pretraining"]
+    corpus = P_cfg["corpus"]
+    batcher = P_cfg["batcher"]
+    model = create_pretraining_model(nlp, config["pretraining"])
+    optimizer = config["pretraining"]["optimizer"]

    # Load in pretrained weights to resume from
    if resume_path is not None:
@@ -147,38 +137,35 @@ def pretrain(
        with (output_dir / "log.jsonl").open("a") as file_:
            file_.write(srsly.json_dumps(log) + "\n")

-    skip_counter = 0
-    objective = create_objective(pretrain_config["objective"])
-    for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
-        batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
-        for batch_id, batch in enumerate(batches):
-            docs, count = make_docs(
-                nlp,
-                batch,
-                max_length=pretrain_config["max_length"],
-                min_length=pretrain_config["min_length"],
-            )
-            skip_counter += count
+    objective = create_objective(P_cfg["objective"])
+    # TODO: I think we probably want this to look more like the
+    # 'create_train_batches' function?
+    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
+        for batch_id, batch in enumerate(batcher(corpus(nlp))):
+            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
-                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
-                    break
-            if pretrain_config["n_save_every"] and (
-                batch_id % pretrain_config["n_save_every"] == 0
+            if P_cfg["n_save_every"] and (
+                batch_id % P_cfg["n_save_every"] == 0
            ):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
-        if texts_loc != "-":
-            # Reshuffle the texts if texts were loaded from a file
-            random.shuffle(texts)
-    if skip_counter > 0:
-        msg.warn(f"Skipped {skip_counter} empty values")
    msg.good("Successfully finished pretrain")


+def ensure_docs(examples_or_docs):
+    docs = []
+    for eg_or_doc in examples_or_docs:
+        if isinstance(eg_or_doc, Doc):
+            docs.append(eg_or_doc)
+        else:
+            docs.append(eg_or_doc.reference)
+    return docs


def _resume_model(model, resume_path, epoch_resume):
    msg.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
@@ -211,36 +198,6 @@ def make_update(model, docs, optimizer, objective_func):
    return float(loss)


-def make_docs(nlp, batch, min_length, max_length):
-    docs = []
-    skip_count = 0
-    for record in batch:
-        if not isinstance(record, dict):
-            raise TypeError(Errors.E137.format(type=type(record), line=record))
-        if "tokens" in record:
-            words = record["tokens"]
-            if not words:
-                skip_count += 1
-                continue
-            doc = Doc(nlp.vocab, words=words)
-        elif "text" in record:
-            text = record["text"]
-            if not text:
-                skip_count += 1
-                continue
-            doc = nlp.make_doc(text)
-        else:
-            raise ValueError(Errors.E138.format(text=record))
-        if "heads" in record:
-            heads = record["heads"]
-            heads = numpy.asarray(heads, dtype="uint64")
-            heads = heads.reshape((len(doc), 1))
-            doc = doc.from_array([HEAD], heads)
-        if min_length <= len(doc) < max_length:
-            docs.append(doc)
-    return docs, skip_count


def create_objective(config):
    """Create the objective for pretraining.
@@ -296,7 +253,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
    return loss, d_target


-def create_pretraining_model(nlp, tok2vec, pretrain_config):
+def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
@@ -304,6 +261,12 @@ def create_pretraining_model(nlp, tok2vec, pretrain_config):
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    """
+    component = nlp.get_pipe(pretrain_config["component"])
+    if pretrain_config.get("layer"):
+        tok2vec = component.model.get_ref(pretrain_config["layer"])
+    else:
+        tok2vec = component.model
    # TODO
    maxout_pieces = 3
    hidden_size = 300
@@ -372,7 +335,7 @@ def _smart_round(figure, width=10, max_decimal=4):
    return format_str % figure


-def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
+def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if output_dir.exists() and [p for p in output_dir.iterdir()]:
@@ -388,16 +351,6 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
            "It is better to use an empty directory or refer to a new output path, "
            "then the new directory will be created for you.",
        )
-    if texts_loc != "-":  # reading from a file
-        texts_loc = Path(texts_loc)
-        if not texts_loc.exists():
-            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
-        for text in srsly.read_jsonl(texts_loc):
-            break
-        else:
-            msg.fail("Input file is empty", texts_loc, exits=1)
    if resume_path is not None:
        model_name = re.search(r"model\d+\.bin", str(resume_path))
        if not model_name and not epoch_resume:
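To make the new config-driven flow concrete, here is a minimal sketch of how a corpus reader and batcher plug into the pretraining loop above. The inline reader and batcher are stand-ins for the registered functions the real command resolves from the config; they are illustrative assumptions only:

```python
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

def corpus(nlp):
    # Stand-in for P_cfg["corpus"]: yields raw Doc objects (or Examples).
    for text in ["A short text.", "Another raw text for pretraining."]:
        yield nlp.make_doc(text)

def batcher(docs):
    # Stand-in for P_cfg["batcher"]: groups the stream into fixed-size batches.
    batch = []
    for doc in docs:
        batch.append(doc)
        if len(batch) == 2:
            yield batch
            batch = []
    if batch:
        yield batch

def ensure_docs(examples_or_docs):
    # Same logic as the helper above: accept either Doc objects or Examples.
    return [x if isinstance(x, Doc) else x.reference for x in examples_or_docs]

for batch_id, batch in enumerate(batcher(corpus(nlp))):
    docs = ensure_docs(batch)
    print(batch_id, [len(doc) for doc in docs])
```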


@@ -7,7 +7,7 @@ import requests

from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
-from .._util import download_file, git_sparse_checkout
+from .._util import download_file, git_sparse_checkout, get_git_version


@project_cli.command("assets")
@@ -41,6 +41,11 @@ def project_assets(project_dir: Path) -> None:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
+            git_err = (
+                f"Cloning spaCy project templates requires Git and the 'git' command. "
+                f"Make sure it's installed and that the executable is available."
+            )
+            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):


@@ -7,7 +7,7 @@ import re

from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_sparse_checkout
+from .._util import git_sparse_checkout, get_git_version


@project_cli.command("clone")
@@ -70,16 +70,12 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
-    try:
-        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
-    except Exception:
-        msg.fail(
-            f"Cloning spaCy project templates requires Git and the 'git' command. ",
-            f"To clone a project without Git, copy the files from the '{name}' "
-            f"directory in the {repo} to {dest} manually and then run:",
-            f"{COMMAND} project init {dest}",
-            exits=1,
-        )
+    git_err = (
+        f"Cloning spaCy project templates requires Git and the 'git' command. ",
+        f"To clone a project without Git, copy the files from the '{name}' "
+        f"directory in the {repo} to {dest} manually.",
+    )
+    get_git_version(error=git_err)
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():


@@ -671,6 +671,9 @@ class Errors:
    E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
    E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
             "that you are providing a list of patterns as `List[List[dict]]`.")
+    E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
+             "through token.morph_ instead or add the string to the "
+             "StringStore with `nlp.vocab.strings.add(string)`.")


@add_codes


@@ -244,7 +244,8 @@ class Language:
        self._config["nlp"]["disabled"] = list(self.disabled)
        self._config["components"] = pipeline
        if not self._config["training"].get("score_weights"):
-            self._config["training"]["score_weights"] = combine_score_weights(score_weights)
+            combined_score_weights = combine_score_weights(score_weights)
+            self._config["training"]["score_weights"] = combined_score_weights
        if not srsly.is_json_serializable(self._config):
            raise ValueError(Errors.E961.format(config=self._config))
        return self._config
@@ -1166,14 +1167,20 @@ class Language:
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="Language", obj=type(get_examples))
            raise ValueError(err)
+        valid_examples = False
        for example in get_examples():
            if not isinstance(example, Example):
                err = Errors.E978.format(
                    name="Language.begin_training", types=type(example)
                )
                raise ValueError(err)
+            else:
+                valid_examples = True
            for word in [t.text for t in example.reference]:
                _ = self.vocab[word]  # noqa: F841
+        if not valid_examples:
+            err = Errors.E930.format(name="Language", obj="empty list")
+            raise ValueError(err)
        if device >= 0:  # TODO: do we need this here?
            require_gpu(device)
        if self.vocab.vectors.data.shape[1] >= 1:
@@ -1274,7 +1281,7 @@ class Language:
            util.logger.debug(doc)
            eg.predicted = doc
        results = scorer.score(examples)
-        n_words = sum(len(eg.predicted) for eg in examples)
+        n_words = sum(len(doc) for doc in docs)
        results["speed"] = n_words / (end_time - start_time)
        return results


@@ -56,7 +56,7 @@ subword_features = true
@Language.factory(
    "textcat",
    assigns=["doc.cats"],
-    default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL},
+    default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
    scores=[
        "cats_score",
        "cats_score_desc",
@@ -75,6 +75,7 @@ def make_textcat(
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    labels: Iterable[str],
+    threshold: float,
) -> "TextCategorizer":
    """Create a TextCategorizer compoment. The text categorizer predicts categories
    over a whole document. It can learn one or more labels, and the labels can
@@ -86,8 +87,9 @@ def make_textcat(
        scores for each category.
    labels (list): A list of categories to learn. If empty, the model infers the
        categories from the data.
+    threshold (float): Cutoff to consider a prediction "positive".
    """
-    return TextCategorizer(nlp.vocab, model, name, labels=labels)
+    return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold)


class TextCategorizer(Pipe):
@@ -103,6 +105,7 @@ class TextCategorizer(Pipe):
        name: str = "textcat",
        *,
        labels: Iterable[str],
+        threshold: float,
    ) -> None:
        """Initialize a text categorizer.
@@ -111,6 +114,7 @@ class TextCategorizer(Pipe):
        name (str): The component instance name, used to add entries to the
            losses during training.
        labels (Iterable[str]): The labels to use.
+        threshold (float): Cutoff to consider a prediction "positive".

        DOCS: https://nightly.spacy.io/api/textcategorizer#init
        """
@@ -118,7 +122,7 @@ class TextCategorizer(Pipe):
        self.model = model
        self.name = name
        self._rehearsal_model = None
-        cfg = {"labels": labels}
+        cfg = {"labels": labels, "threshold": threshold}
        self.cfg = dict(cfg)

    @property
@@ -371,5 +375,6 @@ class TextCategorizer(Pipe):
            labels=self.labels,
            multi_label=self.model.attrs["multi_label"],
            positive_label=positive_label,
+            threshold=self.cfg["threshold"],
            **kwargs,
        )
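A short usage sketch of the new `threshold` setting on the factory; the values are illustrative, not recommendations:

```python
import spacy

nlp = spacy.blank("en")
# "threshold" is now part of the component's default config and can be overridden.
textcat = nlp.add_pipe("textcat", config={"labels": [], "threshold": 0.5})
print(textcat.cfg["threshold"])  # 0.5
```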


@@ -246,15 +246,14 @@ class ConfigSchemaPretrainEmpty(BaseModel):

class ConfigSchemaPretrain(BaseModel):
    # fmt: off
    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
-    min_length: StrictInt = Field(..., title="Minimum length of examples")
-    max_length: StrictInt = Field(..., title="Maximum length of examples")
    dropout: StrictFloat = Field(..., title="Dropout rate")
    n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
-    batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule")
-    seed: Optional[StrictInt] = Field(..., title="Random seed")
-    use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
-    tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. components.tok2vec.model")
    optimizer: Optimizer = Field(..., title="The optimizer to use")
+    corpus: Reader = Field(..., title="Reader for the training data")
+    batcher: Batcher = Field(..., title="Batcher for the training data")
+    component: str = Field(..., title="Component to find the layer to pretrain")
+    layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
    # TODO: use a more detailed schema for this?
    objective: Dict[str, Any] = Field(..., title="Pretraining objective")
    # fmt: on
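To make the schema change concrete, a sketch of a `[pretraining]` block shaped like the fields above, parsed with Thinc's `Config`. The values, and the omission of the required `optimizer`, `corpus` and `batcher` sub-blocks, are illustrative assumptions rather than shipped defaults:

```python
from thinc.api import Config

cfg_str = """
[pretraining]
max_epochs = 1000
dropout = 0.2
n_save_every = null
component = "tok2vec"
layer = ""

[pretraining.objective]
type = "characters"
n_characters = 4
"""

config = Config().from_str(cfg_str)
print(config["pretraining"]["component"])  # "tok2vec"
print(config["pretraining"]["objective"])  # {'type': 'characters', 'n_characters': 4}
```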


@@ -9,7 +9,10 @@ from spacy.pipeline.ner import DEFAULT_NER_MODEL

def _ner_example(ner):
-    doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
+    doc = Doc(
+        ner.vocab,
+        words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
+    )
    gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
    return Example.from_dict(doc, gold)


@@ -66,3 +66,31 @@ def test_morph_set(i_has):
def test_morph_str(i_has):
    assert str(i_has[0].morph) == "PronType=prs"
    assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin"
+
+
+def test_morph_property(tokenizer):
+    doc = tokenizer("a dog")
+    # set through token.morph_
+    doc[0].morph_ = "PronType=prs"
+    assert doc[0].morph_ == "PronType=prs"
+    assert doc.to_array(["MORPH"])[0] != 0
+    # unset with token.morph
+    doc[0].morph = 0
+    assert doc.to_array(["MORPH"])[0] == 0
+    # empty morph is equivalent to "_"
+    doc[0].morph_ = ""
+    assert doc[0].morph_ == ""
+    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
+    # "_" morph is also equivalent to empty morph
+    doc[0].morph_ = "_"
+    assert doc[0].morph_ == ""
+    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
+    # set through existing hash with token.morph
+    tokenizer.vocab.strings.add("Feat=Val")
+    doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
+    assert doc[0].morph_ == "Feat=Val"


@@ -78,7 +78,7 @@ def patterns(en_vocab):
            "REL_OP": ">",
            "RIGHT_ID": "fox",
            "RIGHT_ATTRS": {"ORTH": "fox"},
-        }
+        },
    ]

    pattern5 = [
@@ -233,9 +233,7 @@ def test_dependency_matcher_callback(en_vocab, doc):
    assert matches == matches2


-@pytest.mark.parametrize(
-    "op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20),]
-)
+@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
    # two sentences to test that all matches are within the same sentence
    doc = get_doc(
@@ -248,7 +246,7 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
    for text in ["a", "b", "c", "d", "e"]:
        pattern = [
            {"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
-            {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {},},
+            {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}},
        ]
        matcher = DependencyMatcher(en_vocab)
        matcher.add("A", [pattern])


@@ -54,7 +54,10 @@ def _parser_example(parser):

def _ner_example(ner):
-    doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
+    doc = Doc(
+        ner.vocab,
+        words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
+    )
    gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
    return Example.from_dict(doc, gold)


@@ -30,9 +30,10 @@ TRAIN_DATA = [
    ),
]


def test_begin_training_examples():
    nlp = Language()
-    senter = nlp.add_pipe("senter")
+    nlp.add_pipe("senter")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))


@@ -89,7 +89,7 @@ def test_no_label():

def test_implicit_label():
    nlp = Language()
-    textcat = nlp.add_pipe("textcat")
+    nlp.add_pipe("textcat")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))


@@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    cfg = {"model": DEFAULT_TEXTCAT_MODEL}
    model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"])
+    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5)
    textcat.to_bytes(exclude=["vocab"])


@@ -5,7 +5,6 @@ from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
-from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
@@ -231,48 +230,6 @@ def test_cli_converters_conll_ner2json():
        assert ent.text in ["New York City", "London"]


-def test_pretrain_make_docs():
-    nlp = English()
-    valid_jsonl_text = {"text": "Some text"}
-    docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)
-    assert len(docs) == 1
-    assert skip_count == 0
-    valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}
-    docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)
-    assert len(docs) == 1
-    assert skip_count == 0
-    invalid_jsonl_type = 0
-    with pytest.raises(TypeError):
-        make_docs(nlp, [invalid_jsonl_type], 1, 100)
-    invalid_jsonl_key = {"invalid": "Does not matter"}
-    with pytest.raises(ValueError):
-        make_docs(nlp, [invalid_jsonl_key], 1, 100)
-    empty_jsonl_text = {"text": ""}
-    docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)
-    assert len(docs) == 0
-    assert skip_count == 1
-    empty_jsonl_tokens = {"tokens": []}
-    docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)
-    assert len(docs) == 0
-    assert skip_count == 1
-    too_short_jsonl = {"text": "This text is not long enough"}
-    docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)
-    assert len(docs) == 0
-    assert skip_count == 0
-    too_long_jsonl = {"text": "This text contains way too much tokens for this test"}
-    docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)
-    assert len(docs) == 0
-    assert skip_count == 0


def test_project_config_validation_full():
    config = {
        "vars": {"some_var": 20},


@@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
    tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
    doc = tokenizer(text)
    assert [token.text for token in doc] == ["_SPECIAL_", "."]
+
+
+def test_tokenizer_special_cases_idx(tokenizer):
+    text = "the _ID'X_"
+    tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
+    doc = tokenizer(text)
+    assert doc[1].idx == 4
+    assert doc[2].idx == 7


@@ -343,8 +343,9 @@ cdef class Tokenizer:
                for j in range(cached.length):
                    tokens[i + offset + j] = cached.data.tokens[j]
                    tokens[i + offset + j].idx = orig_idx + idx_offset
-                    idx_offset += cached.data.tokens[j].lex.length + \
-                        1 if cached.data.tokens[j].spacy else 0
+                    idx_offset += cached.data.tokens[j].lex.length
+                    if cached.data.tokens[j].spacy:
+                        idx_offset += 1
                tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
                i += span_end - span_start
                offset += span[3]


@@ -214,9 +214,17 @@ cdef class Token:
        xp = get_array_module(vector)
        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))

-    @property
-    def morph(self):
-        return MorphAnalysis.from_id(self.vocab, self.c.morph)
+    property morph:
+        def __get__(self):
+            return MorphAnalysis.from_id(self.vocab, self.c.morph)
+
+        def __set__(self, attr_t morph):
+            if morph == 0:
+                self.c.morph = morph
+            elif morph in self.vocab.strings:
+                self.morph_ = self.vocab.strings[morph]
+            else:
+                raise ValueError(Errors.E1009.format(val=morph))

    property morph_:
        def __get__(self):
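Informally, a usage sketch of the new setter, mirroring the test added elsewhere in this commit; `spacy.blank("en")` is used here only for illustration:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("a dog")

# Set the analysis from a string via token.morph_ ...
doc[0].morph_ = "PronType=prs"
print(doc[0].morph_)  # "PronType=prs"

# ... or from a known hash via token.morph; unknown hashes raise E1009.
feat_hash = nlp.vocab.strings.add("Feat=Val")
doc[0].morph = feat_hash
print(doc[0].morph_)  # "Feat=Val"

# Setting 0 clears the analysis.
doc[0].morph = 0
```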


@@ -1,6 +1,7 @@
import warnings
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from pathlib import Path
+import srsly

from .. import util
from .example import Example
@@ -21,6 +22,36 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
    return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)


+@util.registry.readers("spacy.JsonlReader.v1")
+def create_jsonl_reader(
+    path: Path, min_length: int=0, max_length: int = 0, limit: int = 0
+) -> Callable[["Language"], Iterable[Doc]]:
+    return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
+
+
+def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
+    path = util.ensure_path(path)
+    if not path.is_dir() and path.parts[-1].endswith(file_type):
+        return [path]
+    orig_path = path
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts and path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        elif path.parts[-1].endswith(file_type):
+            locs.append(path)
+    if len(locs) == 0:
+        warnings.warn(Warnings.W090.format(path=orig_path))
+    return locs
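A small sketch of what `walk_corpus` does, assuming it remains a module-level helper in the training corpus module; the directory name is hypothetical:

```python
from spacy.training.corpus import walk_corpus  # import path assumed

# Recursively collect all .spacy files under "corpus/", skipping hidden entries,
# and warn if nothing is found.
paths = walk_corpus("corpus/", ".spacy")
print([str(p) for p in paths])
```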
class Corpus:
    """Iterate Example objects from a file or directory of DocBin (.spacy)
@@ -47,36 +78,13 @@ class Corpus:
        *,
        limit: int = 0,
        gold_preproc: bool = False,
-        max_length: bool = False,
+        max_length: int = 0,
    ) -> None:
        self.path = util.ensure_path(path)
        self.gold_preproc = gold_preproc
        self.max_length = max_length
        self.limit = limit

-    @staticmethod
-    def walk_corpus(path: Union[str, Path]) -> List[Path]:
-        path = util.ensure_path(path)
-        if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE):
-            return [path]
-        orig_path = path
-        paths = [path]
-        locs = []
-        seen = set()
-        for path in paths:
-            if str(path) in seen:
-                continue
-            seen.add(str(path))
-            if path.parts and path.parts[-1].startswith("."):
-                continue
-            elif path.is_dir():
-                paths.extend(path.iterdir())
-            elif path.parts[-1].endswith(FILE_TYPE):
-                locs.append(path)
-        if len(locs) == 0:
-            warnings.warn(Warnings.W090.format(path=orig_path))
-        return locs

    def __call__(self, nlp: "Language") -> Iterator[Example]:
        """Yield examples from the data.
@@ -85,11 +93,11 @@ class Corpus:

        DOCS: https://nightly.spacy.io/api/corpus#call
        """
-        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
+        ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE))
        if self.gold_preproc:
            examples = self.make_examples_gold_preproc(nlp, ref_docs)
        else:
-            examples = self.make_examples(nlp, ref_docs, self.max_length)
+            examples = self.make_examples(nlp, ref_docs)
        yield from examples

    def _make_example(
@@ -108,18 +116,18 @@ class Corpus:
        return Example(nlp.make_doc(reference.text), reference)

    def make_examples(
-        self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
+        self, nlp: "Language", reference_docs: Iterable[Doc]
    ) -> Iterator[Example]:
        for reference in reference_docs:
            if len(reference) == 0:
                continue
-            elif max_length == 0 or len(reference) < max_length:
+            elif self.max_length == 0 or len(reference) < self.max_length:
                yield self._make_example(nlp, reference, False)
            elif reference.is_sentenced:
                for ref_sent in reference.sents:
                    if len(ref_sent) == 0:
                        continue
-                    elif max_length == 0 or len(ref_sent) < max_length:
+                    elif self.max_length == 0 or len(ref_sent) < self.max_length:
                        yield self._make_example(nlp, ref_sent.as_doc(), False)

    def make_examples_gold_preproc(
@@ -151,3 +159,57 @@ class Corpus:
            i += 1
            if self.limit >= 1 and i >= self.limit:
                break
+
+
+class JsonlTexts:
+    """Iterate Doc objects from a file or directory of jsonl
+    formatted raw text files.
+
+    path (Path): The directory or filename to read from.
+    min_length (int): Minimum document length (in tokens). Shorter documents
+        will be skipped. Defaults to 0, which indicates no limit.
+    max_length (int): Maximum document length (in tokens). Longer documents will
+        be skipped. Defaults to 0, which indicates no limit.
+    limit (int): Limit corpus to a subset of examples, e.g. for debugging.
+        Defaults to 0, which indicates no limit.
+
+    DOCS: https://nightly.spacy.io/api/corpus
+    """
+
+    file_type = "jsonl"
+
+    def __init__(
+        self,
+        path: Union[str, Path],
+        *,
+        limit: int = 0,
+        min_length: int = 0,
+        max_length: int = 0,
+    ) -> None:
+        self.path = util.ensure_path(path)
+        self.min_length = min_length
+        self.max_length = max_length
+        self.limit = limit
+
+    def __call__(self, nlp: "Language") -> Iterator[Example]:
+        """Yield examples from the data.
+
+        nlp (Language): The current nlp object.
+        YIELDS (Doc): The docs.
+
+        DOCS: https://nightly.spacy.io/api/corpus#call
+        """
+        for loc in walk_corpus(self.path, "jsonl"):
+            records = srsly.read_jsonl(loc)
+            for record in records:
+                doc = nlp.make_doc(record["text"])
+                if self.min_length >= 1 and len(doc) < self.min_length:
+                    continue
+                elif self.max_length >= 1 and len(doc) >= self.max_length:
+                    continue
+                else:
+                    words = [w.text for w in doc]
+                    spaces = [bool(w.whitespace_) for w in doc]
+                    # We don't *need* an example here, but it seems nice to
+                    # make it match the Corpus signature.
+                    yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
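A hedged sketch of how the registered JSONL reader might be used directly. The registry lookup mirrors the decorator above; the file path and the assumption that each line holds a `{"text": ...}` record are illustrative:

```python
from spacy import util
from spacy.lang.en import English

nlp = English()
make_reader = util.registry.readers.get("spacy.JsonlReader.v1")
corpus = make_reader("texts.jsonl", min_length=1, max_length=500)  # hypothetical file
for example in corpus(nlp):
    print(example.reference.text)
```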


@@ -36,20 +36,12 @@ def console_logger():
                    keys=list(info["losses"].keys()),
                )
            ) from None
-        try:
-            scores = [
-                "{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100)
-                for col in score_cols
-            ]
-        except KeyError as e:
-            raise KeyError(
-                Errors.E983.format(
-                    dict="scores (other)",
-                    key=str(e),
-                    keys=list(info["other_scores"].keys()),
-                )
-            ) from None
+        scores = []
+        for col in score_cols:
+            score = float(info["other_scores"].get(col, 0.0))
+            if col != "speed":
+                score *= 100
+            scores.append("{0:.2f}".format(score))
        data = (
            [info["epoch"], info["step"]]
            + losses


@@ -648,12 +648,20 @@ def join_command(command: List[str]) -> str:
    return " ".join(shlex.quote(cmd) for cmd in command)


-def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
+def run_command(
+    command: Union[str, List[str]],
+    *,
+    capture: bool = False,
+    stdin: Optional[Any] = None,
+) -> Optional[subprocess.CompletedProcess]:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.

    command (str / List[str]): The command. If provided as a string, the
        string will be split using shlex.split.
+    stdin (Optional[Any]): stdin to read from or None.
+    capture (bool): Whether to capture the output.
+    RETURNS (Optional[CompletedProcess]): The process object.
    """
    if isinstance(command, str):
        command = split_command(command)
@@ -671,6 +679,10 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
        raise FileNotFoundError(
            Errors.E970.format(str_command=" ".join(command), tool=command[0])
        ) from None
+    except subprocess.CalledProcessError as e:
+        # We don't want a duplicate traceback here
+        print(e)
+        sys.exit(1)
    if ret.returncode != 0:
        sys.exit(ret.returncode)
    return ret
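A small usage sketch of `run_command` with output capturing; the command itself is just an example:

```python
from spacy.util import run_command

# With capture=True the CompletedProcess is returned and stdout is available as text.
# A missing executable or a non-zero exit code results in a clean system exit.
ret = run_command(["git", "--version"], capture=True)
if ret is not None:
    print(ret.stdout.strip())
```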


@@ -14,6 +14,7 @@ menu:
  - ['evaluate', 'evaluate']
  - ['package', 'package']
  - ['project', 'project']
+  - ['ray', 'ray']
---

spaCy's CLI provides a range of helpful commands for downloading and training
@@ -1134,3 +1135,47 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES**       | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
## ray {#ray new="3"}
The `spacy ray` CLI includes commands for parallel and distributed computing via
[Ray](https://ray.io).
<Infobox variant="warning">
To use this command, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
### ray train {#ray-train tag="command"}
Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
command works just like [`spacy train`](/api/cli#train). For more details and
examples, see the usage guide on
[parallel training](/usage/training#parallel-training) and the spaCy project
[integration](/usage/projects#ray).
```cli
$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
```
> #### Example
>
> ```cli
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
| Name | Description |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |


@@ -30,15 +30,17 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
> config = {
>    "labels": [],
+>    "threshold": 0.5,
>    "model": DEFAULT_TEXTCAT_MODEL,
> }
> nlp.add_pipe("textcat", config=config)
> ```

| Setting     | Description |
-| -------- | ----------- |
+| ----------- | ----------- |
| `labels`    | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
| `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |

```python
%%GITHUB_SPACY/spacy/pipeline/textcat.py
@@ -58,7 +60,7 @@ architectures and their arguments and hyperparameters.
>
> # Construction from class
> from spacy.pipeline import TextCategorizer
-> textcat = TextCategorizer(nlp.vocab, model)
+> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5)
> ```

Create a new pipeline instance. In your application, you would normally use a
@@ -72,6 +74,7 @@ shortcut for this and instantiate the component using its string name and
| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `labels`       | The labels to use. ~~Iterable[str]~~ |
+| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |

## TextCategorizer.\_\_call\_\_ {#call tag="method"}

File diff suppressed because one or more lines are too long

Binary image added (67 KiB).


@@ -26,7 +26,7 @@ on training Stanza on this corpus to allow direct comparison.

<figure>

-| System               | POS  | USA  | LAS  |
+| System               | POS  | UAS  | LAS  |
| -------------------- | ---: | ---: | ---: |
| spaCy RoBERTa (2020) |      |      |      |
| spaCy CNN (2020)     |      |      |      |


@@ -61,17 +61,13 @@ import Benchmarks from 'usage/\_benchmarks-models.md'

<Benchmarks />

-<!-- TODO:
-<Project id="benchmarks/penn_treebank">
+<Project id="benchmarks/parsing_penn_treebank">

The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
our project template.

</Project>
--->

<!-- ## Citing spaCy {#citation}

<!-- TODO: update -->


@@ -796,11 +796,9 @@ workflows, including
evaluation workflow that lets you compare two different models and their
results.

-<Project id="integrations/prodigy">
-<!-- TODO: -->
-</Project>
+<!-- TODO: <Project id="integrations/prodigy">
+</Project> -->

---
@@ -817,7 +815,7 @@ full embedded visualizer, as well as individual components.
> #### Installation
>
> ```bash
-> $ pip install "spacy_streamlit>=1.0.0a0"
+> $ pip install "spacy-streamlit>=1.0.0a0"
> ```

![](../images/spacy-streamlit.png)
@@ -915,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
</Infobox>

-<!-- TODO: document -->
+> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray for parallel and distributed
training with spaCy via our lightweight
[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
package is installed in the same environment as spaCy, it will automatically add
[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
You can integrate [`spacy ray train`](/api/cli#ray-train) into your
`project.yml` just like the regular training command:
<!-- prettier-ignore -->
```yaml
### project.yml
- name: "ray"
help: "Train a model via parallel training with Ray"
script:
- "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
deps:
- "corpus/train.spacy"
- "corpus/dev.spacy"
```
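Once defined, the command runs like any other project command, for example via `python -m spacy project run ray` from the project directory (using the command name `"ray"` declared in the snippet above).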
<!-- TODO: <Project id="integrations/ray">
</Project> -->
--- ---
@ -943,12 +973,14 @@ your results.
![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values') ![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values')
<!-- TODO:
<Project id="integrations/wandb"> <Project id="integrations/wandb">
Get started with tracking your spaCy training runs in Weights & Biases using our Get started with tracking your spaCy training runs in Weights & Biases using our
project template. It includes a simple config using the `WandbLogger`, as well project template. It includes a simple config using the `WandbLogger`, as well
as a custom logger implementation you can adjust for your specific use case. as a custom logger implementation you can adjust for your specific use case.
<!-- TODO: -->
</Project> </Project>
-->
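Until the template is available, a rough sketch of enabling the logger: assuming it is registered as `spacy.WandbLogger.v1` and takes a `project_name` argument, you would point the `[training.logger]` block of your training config at `@loggers = "spacy.WandbLogger.v1"` and set `project_name` to the name of your Weights & Biases project.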

View File

@ -1075,7 +1075,7 @@ relations and tokens we want to match:
> #### Visualizing the parse > #### Visualizing the parse
> >
> The [`displacy` visualizer](/usage/visualizer) lets you render `Doc` objects > The [`displacy` visualizer](/usage/visualizers) lets you render `Doc` objects
> and their dependency parse and part-of-speech tags: > and their dependency parse and part-of-speech tags:
> >
> ```python > ```python

View File

@ -7,7 +7,7 @@ menu:
- ['Quickstart', 'quickstart'] - ['Quickstart', 'quickstart']
- ['Config System', 'config'] - ['Config System', 'config']
- ['Custom Functions', 'custom-functions'] - ['Custom Functions', 'custom-functions']
# - ['Parallel Training', 'parallel-training'] - ['Parallel Training', 'parallel-training']
- ['Internal API', 'api'] - ['Internal API', 'api']
--- ---
@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width) return create_model(output_width)
``` ```
## Parallel & distributed training with Ray {#parallel-training}
> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. Parallel
training won't always be faster, though: it depends on your batch size, models,
and hardware.
<Infobox variant="warning">
To use Ray with spaCy, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
setup. You can optionally set the `--address` option to point to your Ray
cluster. If it's not set, Ray will run locally.
```cli
python -m spacy ray train config.cfg --n-workers 2
```
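To attach to an existing Ray cluster instead of running locally, you would additionally pass its address, e.g. `python -m spacy ray train config.cfg --n-workers 2 --address <cluster address>`, replacing the placeholder with your cluster's address.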
<!-- TODO: <Project id="integrations/ray">
</Project> -->
### How parallel training works {#parallel-training-details}
Each worker receives a shard of the **data** and builds a copy of the **model
and optimizer** from the [`config.cfg`](#config). It also has a communication
channel to **pass gradients and parameters** to the other workers. Additionally,
each worker is given ownership of a subset of the parameter arrays. Every
parameter array is owned by exactly one worker, and the workers are given a
mapping so they know which worker owns which parameter.
![Illustration of setup](../images/spacy-ray.svg)
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the
gradient. Workers use the gradients they receive and the ones they compute
locally to update the parameters they own, and then broadcast the updated array
and a new version ID to the other workers.
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates; they do not have to pull
them and block on the result, so the transfers can happen in the background,
overlapped with the actual training work. The workers also do not have to stop
and wait for each other ("synchronize") at the start of each batch. This is very
useful for spaCy, because spaCy is often trained on long documents, which means
**batches can vary in size** significantly. Uneven workloads make synchronous
gradient descent inefficient, because if one batch is slow, all of the other
workers are stuck waiting for it to complete before they can continue.
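To make the ownership and versioning scheme concrete, here is a deliberately simplified, single-process sketch. It is not the `spacy-ray` implementation: the `Worker` class, the plain-SGD update and the random stand-in gradients are illustrative assumptions, and real workers would exchange messages over Ray rather than call each other directly.

```python
# Single-process sketch of parameter ownership, gradient routing and
# version IDs. All names and the update rule are illustrative assumptions.
import numpy as np

class Worker:
    def __init__(self, worker_id, owned, params, lr=0.1):
        self.id = worker_id
        self.owned = set(owned)                                  # parameters this worker owns
        self.params = {k: v.copy() for k, v in params.items()}   # local copy of every parameter
        self.versions = {k: 0 for k in params}                   # version ID per parameter
        self.peers = []                                          # other workers, wired up below
        self.lr = lr

    def handle_gradient(self, name, grad, version):
        # Called on the owner. Discard gradients computed against a stale version.
        if version != self.versions[name]:
            return
        self.params[name] -= self.lr * grad
        self.versions[name] += 1
        # Broadcast the updated array and its new version to the other workers.
        for peer in self.peers:
            peer.receive_update(name, self.params[name], self.versions[name])

    def receive_update(self, name, value, version):
        self.params[name] = value.copy()
        self.versions[name] = version

    def train_step(self):
        # Pretend this batch produced a gradient for every parameter.
        for name, value in self.params.items():
            grad = 0.01 * np.random.randn(*value.shape)          # stand-in gradient
            if name in self.owned:
                self.handle_gradient(name, grad, self.versions[name])
            else:
                owner = next(p for p in self.peers if name in p.owned)
                owner.handle_gradient(name, grad, self.versions[name])

params = {"embed": np.zeros((4, 4)), "output": np.zeros((4, 2))}
w0 = Worker(0, ["embed"], params)
w1 = Worker(1, ["output"], params)
w0.peers, w1.peers = [w1], [w0]
for _ in range(3):
    w0.train_step()
    w1.train_step()
print(w0.versions, w1.versions)  # versions advance as owners apply updates
```

In the real setup, the gradient messages and broadcasts would be asynchronous Ray calls, which is what lets the transfers overlap with training instead of blocking it.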
## Internal training API {#api} ## Internal training API {#api}
<Infobox variant="warning"> <Infobox variant="warning">

View File

@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
- [Training & config system](#features-training) - [Training & config system](#features-training)
- [Custom models](#features-custom-models) - [Custom models](#features-custom-models)
- [End-to-end project workflows](#features-projects) - [End-to-end project workflows](#features-projects)
- [Parallel training with Ray](#features-parallel-training)
- [New built-in components](#features-pipeline-components) - [New built-in components](#features-pipeline-components)
- [New custom component API](#features-components) - [New custom component API](#features-components)
- [Dependency matching](#features-dep-matcher) - [Dependency matching](#features-dep-matcher)
@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.
</Infobox> </Infobox>
### Parallel and distributed training with Ray {#features-parallel-training}
> #### Example
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> # Train a pipeline
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. The Ray
integration is powered by a lightweight extension package,
[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
parallel training.
![Illustration of setup](../images/spacy-ray.svg)
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage: **
[Parallel and distributed training](/usage/training#parallel-training),
[spaCy Projects integration](/usage/projects#ray)
- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)
</Infobox>
### New built-in pipeline components {#features-pipeline-components} ### New built-in pipeline components {#features-pipeline-components}
spaCy v3.0 includes several new trainable and rule-based components that you can spaCy v3.0 includes several new trainable and rule-based components that you can
@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. | | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
### New and updated documentation {#new-docs} ### New and updated documentation {#new-docs}

View File

@ -26,11 +26,27 @@ const replacements = {
GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
} }
/**
* Compute the overall total counts of models and languages
*/
function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
starters: langs
.map(({ starters }) => (starters ? starters.length : 0))
.reduce((a, b) => a + b, 0),
}
}
module.exports = { module.exports = {
siteMetadata: { siteMetadata: {
...site, ...site,
sidebars, sidebars,
...models, ...models,
counts: getCounts(models.languages),
universe, universe,
nightly: isNightly, nightly: isNightly,
binderBranch, binderBranch,

View File

@ -1,5 +1,16 @@
{ {
"resources": [ "resources": [
{
"id": "spacy-ray",
"title": "spacy-ray",
"slogan": "Parallel and distributed training with spaCy and Ray",
"description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
"github": "explosion/spacy-ray",
"pip": "spacy-ray",
"category": ["training"],
"author": "Explosion / Anyscale",
"thumb": "https://i.imgur.com/7so6ZpS.png"
},
{ {
"id": "spacy-sentence-bert", "id": "spacy-sentence-bert",
"title": "spaCy - sentence-transformers", "title": "spaCy - sentence-transformers",
@ -2518,14 +2529,14 @@
"description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.", "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
"pip": "cov-bsv", "pip": "cov-bsv",
"code_example": [ "code_example": [
"import cov_bsv", "import cov_bsv",
"", "",
"nlp = cov_bsv.load()", "nlp = cov_bsv.load()",
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'", "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
"", "",
"print(doc.ents)", "print(doc.ents)",
"print(doc._.cov_classification)", "print(doc._.cov_classification)",
"cov_bsv.visualize_doc(doc)" "cov_bsv.visualize_doc(doc)"
], ],
"category": ["pipeline", "standalone", "biomedical", "scientific"], "category": ["pipeline", "standalone", "biomedical", "scientific"],
"tags": ["clinical", "epidemiology", "covid-19", "surveillance"], "tags": ["clinical", "epidemiology", "covid-19", "surveillance"],

View File

@ -14,6 +14,7 @@ import GitHubCode from './github'
import classes from '../styles/code.module.sass' import classes from '../styles/code.module.sass'
const WRAP_THRESHOLD = 30 const WRAP_THRESHOLD = 30
const CLI_GROUPS = ['init', 'debug', 'project', 'ray']
export default props => ( export default props => (
<Pre> <Pre>
@ -99,7 +100,6 @@ function replacePrompt(line, prompt, isFirst = false) {
} }
function parseArgs(raw) { function parseArgs(raw) {
const commandGroups = ['init', 'debug', 'project']
let args = raw.split(' ').filter(arg => arg) let args = raw.split(' ').filter(arg => arg)
const result = {} const result = {}
while (args.length) { while (args.length) {
@ -108,7 +108,12 @@ function parseArgs(raw) {
const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-')) const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
result[opt] = isFlag ? true : args.shift() result[opt] = isFlag ? true : args.shift()
} else { } else {
const key = commandGroups.includes(opt) ? `${opt} ${args.shift()}` : opt let key = opt
if (CLI_GROUPS.includes(opt)) {
if (args.length && !args[0].startsWith('-')) {
key = `${opt} ${args.shift()}`
}
}
result[key] = null result[key] = null
} }
} }

View File

@ -38,8 +38,8 @@ export const LandingSubtitle = ({ children }) => (
) )
export const LandingGrid = ({ cols = 3, blocks = false, style, children }) => ( export const LandingGrid = ({ cols = 3, blocks = false, style, children }) => (
<Content className={classNames(classes.grid, { [classes.blocks]: blocks })}> <Content className={classNames({ [classes.blocks]: blocks })}>
<Grid cols={cols} narrow={blocks} style={style}> <Grid cols={cols} narrow={blocks} className={classes.grid} style={style}>
{children} {children}
</Grid> </Grid>
</Content> </Content>

View File

@ -26,8 +26,11 @@
border-bottom-right-radius: 0 border-bottom-right-radius: 0
.icon .icon
width: 2rem $width: 2rem
height: 2rem
width: $width
height: $width
flex: 0 0 $width
background: var(--color-theme) background: var(--color-theme)
color: var(--color-back) color: var(--color-back)
border-radius: 50% border-radius: 50%

View File

@ -128,14 +128,17 @@
padding-right: 2rem padding-right: 2rem
@include breakpoint(max, md) @include breakpoint(max, md)
.banner
padding: 1rem 3rem
.banner-content .banner-content
display: block display: block
.banner-text .banner-text
padding-top: 0 padding-top: 0
.col .grid
grid-column: 1 / span 2 grid-template-columns: 1fr !important
.banner-button .banner-button
margin-bottom: var(--spacing-sm) margin-bottom: var(--spacing-sm)

View File

@ -54,23 +54,8 @@ for entity in doc.ents:
print(entity.text, entity.label_) print(entity.text, entity.label_)
` `
/**
* Compute the overall total counts of models and languages
*/
function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
starters: langs
.map(({ starters }) => (starters ? starters.length : 0))
.reduce((a, b) => a + b, 0),
}
}
const Landing = ({ data }) => { const Landing = ({ data }) => {
const counts = getCounts(data.languages) const { counts } = data
return ( return (
<> <>
<LandingHeader nightly={data.nightly}> <LandingHeader nightly={data.nightly}>
@ -345,7 +330,10 @@ const landingQuery = graphql`
siteMetadata { siteMetadata {
nightly nightly
repo repo
languages { counts {
langs
modelLangs
starterLangs
models models
starters starters
} }