mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-19 13:00:35 +03:00
Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
commit
ceb850f099
|
@ -301,6 +301,7 @@ def ensure_pathy(path):
|
|||
|
||||
|
||||
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
|
||||
git_version = get_git_version()
|
||||
if dest.exists():
|
||||
msg.fail("Destination of checkout must not exist", exits=1)
|
||||
if not dest.parent.exists():
|
||||
|
@ -321,24 +322,28 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
|
|||
# *that* we can do by path.
|
||||
# We're using Git and sparse checkout to only clone the files we need
|
||||
with make_tempdir() as tmp_dir:
|
||||
git_version = get_git_version()
|
||||
supports_sparse = git_version >= (2, 22)
|
||||
# This is the "clone, but don't download anything" part.
|
||||
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
|
||||
if supports_sparse:
|
||||
cmd += f"--filter=blob:none" # <-- The key bit
|
||||
else:
|
||||
msg.warn(
|
||||
err_old = (
|
||||
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
|
||||
f"that doesn't fully support sparse checkout yet. This means that "
|
||||
f"more files than necessary may be downloaded temporarily. To "
|
||||
f"only download the files needed, upgrade to Git v2.22 or above."
|
||||
f"that doesn't fully support sparse checkout yet."
|
||||
)
|
||||
_attempt_run_command(cmd)
|
||||
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
|
||||
msg.warn(
|
||||
f"{err_unk if git_version == (0, 0) else err_old} "
|
||||
f"This means that more files than necessary may be downloaded "
|
||||
f"temporarily. To only download the files needed, make sure "
|
||||
f"you're using Git v2.22 or above."
|
||||
)
|
||||
try_run_command(cmd)
|
||||
# Now we need to find the missing filenames for the subpath we want.
|
||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||
cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
|
||||
ret = _attempt_run_command(cmd)
|
||||
ret = try_run_command(cmd)
|
||||
git_repo = _from_http_to_git(repo)
|
||||
# Now pass those missings into another bit of git internals
|
||||
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
|
@ -351,27 +356,44 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
|
|||
msg.fail(err, exits=1)
|
||||
if supports_sparse:
|
||||
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
|
||||
_attempt_run_command(cmd)
|
||||
try_run_command(cmd)
|
||||
# And finally, we can checkout our subpath
|
||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
||||
_attempt_run_command(cmd)
|
||||
try_run_command(cmd)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||
|
||||
|
||||
def get_git_version() -> Tuple[int, int]:
|
||||
ret = _attempt_run_command(["git", "--version"])
|
||||
# TODO: this seems kinda brittle?
|
||||
version = ret.stdout[11:].strip().split(".")
|
||||
def get_git_version(
|
||||
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
|
||||
) -> Tuple[int, int]:
|
||||
"""Get the version of git and raise an error if calling 'git --version' fails.
|
||||
|
||||
error (str): The error message to show.
|
||||
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
||||
(0, 0) if the version couldn't be determined.
|
||||
"""
|
||||
ret = try_run_command(["git", "--version"], error=error)
|
||||
stdout = ret.stdout.strip()
|
||||
if not stdout or not stdout.startswith("git version"):
|
||||
return (0, 0)
|
||||
version = stdout[11:].strip().split(".")
|
||||
return (int(version[0]), int(version[1]))
|
||||
|
||||
|
||||
def _attempt_run_command(cmd: Union[str, List[str]]):
|
||||
def try_run_command(
|
||||
cmd: Union[str, List[str]], error: str = "Could not run command"
|
||||
) -> subprocess.CompletedProcess:
|
||||
"""Try running a command and raise an error if it fails.
|
||||
|
||||
cmd (Union[str, List[str]]): The command to run.
|
||||
error (str): The error message.
|
||||
RETURNS (CompletedProcess): The completed process if the command ran.
|
||||
"""
|
||||
try:
|
||||
return run_command(cmd, capture=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
err = f"Could not run command"
|
||||
msg.fail(err)
|
||||
msg.fail(error)
|
||||
print(cmd)
|
||||
sys.exit(1)
|
||||
|
||||
|
@ -387,8 +409,15 @@ def _from_http_to_git(repo: str) -> str:
|
|||
return repo
|
||||
|
||||
|
||||
def string_to_list(value, intify=False):
|
||||
"""Parse a comma-separated string to a list"""
|
||||
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
|
||||
"""Parse a comma-separated string to a list and account for various
|
||||
formatting options. Mostly used to handle CLI arguments that take a list of
|
||||
comma-separated values.
|
||||
|
||||
value (str): The value to parse.
|
||||
intify (bool): Whether to convert values to ints.
|
||||
RETURNS (Union[List[str], List[int]]): A list of strings or ints.
|
||||
"""
|
||||
if not value:
|
||||
return []
|
||||
if value.startswith("[") and value.endswith("]"):
|
||||
|
|
|
@ -5,7 +5,8 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
|||
from thinc.api import Model, data_validation
|
||||
import typer
|
||||
|
||||
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
|
||||
from ._util import Arg, Opt, debug_cli, show_validation_error
|
||||
from ._util import parse_config_overrides, string_to_list
|
||||
from .. import util
|
||||
|
||||
|
||||
|
|
|
@ -277,7 +277,7 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
|
|||
|
||||
def ensure_shape(lines):
|
||||
"""Ensure that the first line of the data is the vectors shape.
|
||||
|
||||
|
||||
If it's not, we read in the data and output the shape as the first result,
|
||||
so that the reader doesn't have to deal with the problem.
|
||||
"""
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from typing import Optional, Dict, Any
|
||||
import random
|
||||
from typing import Optional
|
||||
import numpy
|
||||
import time
|
||||
import re
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from thinc.api import Config
|
||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
||||
from thinc.api import CosineDistance, L2Distance
|
||||
|
@ -15,11 +15,10 @@ import typer
|
|||
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code
|
||||
from ..errors import Errors
|
||||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
from ..attrs import ID
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -30,9 +29,8 @@ from .. import util
|
|||
def pretrain_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
|
||||
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
||||
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
||||
|
@ -60,13 +58,35 @@ def pretrain_cli(
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/cli#pretrain
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
config_overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(
|
||||
config_path,
|
||||
overrides=config_overrides,
|
||||
interpolate=True
|
||||
)
|
||||
if not config.get("pretraining"):
|
||||
# TODO: What's the solution here? How do we handle optional blocks?
|
||||
msg.fail("The [pretraining] block in your config is empty", exits=1)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
|
||||
config.to_disk(output_dir / "config.cfg")
|
||||
msg.good("Saved config file in the output directory")
|
||||
|
||||
pretrain(
|
||||
texts_loc,
|
||||
config,
|
||||
output_dir,
|
||||
config_path,
|
||||
config_overrides=overrides,
|
||||
resume_path=resume_path,
|
||||
epoch_resume=epoch_resume,
|
||||
use_gpu=use_gpu,
|
||||
|
@ -74,52 +94,22 @@ def pretrain_cli(
|
|||
|
||||
|
||||
def pretrain(
|
||||
texts_loc: Path,
|
||||
config: Config,
|
||||
output_dir: Path,
|
||||
config_path: Path,
|
||||
config_overrides: Dict[str, Any] = {},
|
||||
resume_path: Optional[Path] = None,
|
||||
epoch_resume: Optional[int] = None,
|
||||
use_gpu: int = -1,
|
||||
use_gpu: int=-1
|
||||
):
|
||||
verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=config_overrides)
|
||||
nlp, config = util.load_model_from_config(config)
|
||||
pretrain_config = config["pretraining"]
|
||||
if not pretrain_config:
|
||||
# TODO: What's the solution here? How do we handle optional blocks?
|
||||
msg.fail("The [pretraining] block in your config is empty", exits=1)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
seed = pretrain_config["seed"]
|
||||
if seed is not None:
|
||||
fix_random_seed(seed)
|
||||
if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]:
|
||||
if config["system"].get("seed") is not None:
|
||||
fix_random_seed(config["system"]["seed"])
|
||||
if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
|
||||
use_pytorch_for_gpu_memory()
|
||||
config.to_disk(output_dir / "config.cfg")
|
||||
msg.good("Saved config file in the output directory")
|
||||
if texts_loc != "-": # reading from a file
|
||||
with msg.loading("Loading input texts..."):
|
||||
texts = list(srsly.read_jsonl(texts_loc))
|
||||
random.shuffle(texts)
|
||||
else: # reading from stdin
|
||||
msg.info("Reading input text from stdin...")
|
||||
texts = srsly.read_jsonl("-")
|
||||
|
||||
tok2vec_path = pretrain_config["tok2vec_model"]
|
||||
tok2vec = config
|
||||
for subpath in tok2vec_path.split("."):
|
||||
tok2vec = tok2vec.get(subpath)
|
||||
model = create_pretraining_model(nlp, tok2vec, pretrain_config)
|
||||
optimizer = pretrain_config["optimizer"]
|
||||
nlp, config = util.load_model_from_config(config)
|
||||
P_cfg = config["pretraining"]
|
||||
corpus = P_cfg["corpus"]
|
||||
batcher = P_cfg["batcher"]
|
||||
model = create_pretraining_model(nlp, config["pretraining"])
|
||||
optimizer = config["pretraining"]["optimizer"]
|
||||
|
||||
# Load in pretrained weights to resume from
|
||||
if resume_path is not None:
|
||||
|
@ -147,38 +137,35 @@ def pretrain(
|
|||
with (output_dir / "log.jsonl").open("a") as file_:
|
||||
file_.write(srsly.json_dumps(log) + "\n")
|
||||
|
||||
skip_counter = 0
|
||||
objective = create_objective(pretrain_config["objective"])
|
||||
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
|
||||
batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
|
||||
for batch_id, batch in enumerate(batches):
|
||||
docs, count = make_docs(
|
||||
nlp,
|
||||
batch,
|
||||
max_length=pretrain_config["max_length"],
|
||||
min_length=pretrain_config["min_length"],
|
||||
)
|
||||
skip_counter += count
|
||||
objective = create_objective(P_cfg["objective"])
|
||||
# TODO: I think we probably want this to look more like the
|
||||
# 'create_train_batches' function?
|
||||
for epoch in range(epoch_resume, P_cfg["max_epochs"]):
|
||||
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
||||
docs = ensure_docs(batch)
|
||||
loss = make_update(model, docs, optimizer, objective)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||
break
|
||||
if pretrain_config["n_save_every"] and (
|
||||
batch_id % pretrain_config["n_save_every"] == 0
|
||||
if P_cfg["n_save_every"] and (
|
||||
batch_id % P_cfg["n_save_every"] == 0
|
||||
):
|
||||
_save_model(epoch, is_temp=True)
|
||||
_save_model(epoch)
|
||||
tracker.epoch_loss = 0.0
|
||||
if texts_loc != "-":
|
||||
# Reshuffle the texts if texts were loaded from a file
|
||||
random.shuffle(texts)
|
||||
if skip_counter > 0:
|
||||
msg.warn(f"Skipped {skip_counter} empty values")
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
||||
def ensure_docs(examples_or_docs):
|
||||
docs = []
|
||||
for eg_or_doc in examples_or_docs:
|
||||
if isinstance(eg_or_doc, Doc):
|
||||
docs.append(eg_or_doc)
|
||||
else:
|
||||
docs.append(eg_or_doc.reference)
|
||||
return docs
|
||||
|
||||
|
||||
def _resume_model(model, resume_path, epoch_resume):
|
||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||
with resume_path.open("rb") as file_:
|
||||
|
@ -211,36 +198,6 @@ def make_update(model, docs, optimizer, objective_func):
|
|||
return float(loss)
|
||||
|
||||
|
||||
def make_docs(nlp, batch, min_length, max_length):
|
||||
docs = []
|
||||
skip_count = 0
|
||||
for record in batch:
|
||||
if not isinstance(record, dict):
|
||||
raise TypeError(Errors.E137.format(type=type(record), line=record))
|
||||
if "tokens" in record:
|
||||
words = record["tokens"]
|
||||
if not words:
|
||||
skip_count += 1
|
||||
continue
|
||||
doc = Doc(nlp.vocab, words=words)
|
||||
elif "text" in record:
|
||||
text = record["text"]
|
||||
if not text:
|
||||
skip_count += 1
|
||||
continue
|
||||
doc = nlp.make_doc(text)
|
||||
else:
|
||||
raise ValueError(Errors.E138.format(text=record))
|
||||
if "heads" in record:
|
||||
heads = record["heads"]
|
||||
heads = numpy.asarray(heads, dtype="uint64")
|
||||
heads = heads.reshape((len(doc), 1))
|
||||
doc = doc.from_array([HEAD], heads)
|
||||
if min_length <= len(doc) < max_length:
|
||||
docs.append(doc)
|
||||
return docs, skip_count
|
||||
|
||||
|
||||
def create_objective(config):
|
||||
"""Create the objective for pretraining.
|
||||
|
||||
|
@ -296,7 +253,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
|
|||
return loss, d_target
|
||||
|
||||
|
||||
def create_pretraining_model(nlp, tok2vec, pretrain_config):
|
||||
def create_pretraining_model(nlp, pretrain_config):
|
||||
"""Define a network for the pretraining. We simply add an output layer onto
|
||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||
|
@ -304,6 +261,12 @@ def create_pretraining_model(nlp, tok2vec, pretrain_config):
|
|||
The actual tok2vec layer is stored as a reference, and only this bit will be
|
||||
serialized to file and read back in when calling the 'train' command.
|
||||
"""
|
||||
component = nlp.get_pipe(pretrain_config["component"])
|
||||
if pretrain_config.get("layer"):
|
||||
tok2vec = component.model.get_ref(pretrain_config["layer"])
|
||||
else:
|
||||
tok2vec = component.model
|
||||
|
||||
# TODO
|
||||
maxout_pieces = 3
|
||||
hidden_size = 300
|
||||
|
@ -372,7 +335,7 @@ def _smart_round(figure, width=10, max_decimal=4):
|
|||
return format_str % figure
|
||||
|
||||
|
||||
def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
|
||||
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
||||
if not config_path or not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
|
@ -388,16 +351,6 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
|
|||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if texts_loc != "-": # reading from a file
|
||||
texts_loc = Path(texts_loc)
|
||||
if not texts_loc.exists():
|
||||
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||
|
||||
for text in srsly.read_jsonl(texts_loc):
|
||||
break
|
||||
else:
|
||||
msg.fail("Input file is empty", texts_loc, exits=1)
|
||||
|
||||
if resume_path is not None:
|
||||
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
||||
if not model_name and not epoch_resume:
|
||||
|
|
|
@ -7,7 +7,7 @@ import requests
|
|||
|
||||
from ...util import ensure_path, working_dir
|
||||
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
|
||||
from .._util import download_file, git_sparse_checkout
|
||||
from .._util import download_file, git_sparse_checkout, get_git_version
|
||||
|
||||
|
||||
@project_cli.command("assets")
|
||||
|
@ -41,6 +41,11 @@ def project_assets(project_dir: Path) -> None:
|
|||
dest = (project_dir / asset["dest"]).resolve()
|
||||
checksum = asset.get("checksum")
|
||||
if "git" in asset:
|
||||
git_err = (
|
||||
f"Cloning spaCy project templates requires Git and the 'git' command. "
|
||||
f"Make sure it's installed and that the executable is available."
|
||||
)
|
||||
get_git_version(error=git_err)
|
||||
if dest.exists():
|
||||
# If there's already a file, check for checksum
|
||||
if checksum and checksum == get_checksum(dest):
|
||||
|
|
|
@ -7,7 +7,7 @@ import re
|
|||
from ... import about
|
||||
from ...util import ensure_path
|
||||
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
|
||||
from .._util import git_sparse_checkout
|
||||
from .._util import git_sparse_checkout, get_git_version
|
||||
|
||||
|
||||
@project_cli.command("clone")
|
||||
|
@ -70,16 +70,12 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
|
|||
dest (Path): Local destination of cloned directory.
|
||||
repo (str): URL of the repo to clone from.
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
|
||||
except Exception:
|
||||
msg.fail(
|
||||
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
||||
f"To clone a project without Git, copy the files from the '{name}' "
|
||||
f"directory in the {repo} to {dest} manually and then run:",
|
||||
f"{COMMAND} project init {dest}",
|
||||
exits=1,
|
||||
)
|
||||
git_err = (
|
||||
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
||||
f"To clone a project without Git, copy the files from the '{name}' "
|
||||
f"directory in the {repo} to {dest} manually.",
|
||||
)
|
||||
get_git_version(error=git_err)
|
||||
if not dest:
|
||||
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
||||
if dest.exists():
|
||||
|
|
|
@ -671,6 +671,9 @@ class Errors:
|
|||
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
|
||||
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
|
||||
"that you are providing a list of patterns as `List[List[dict]]`.")
|
||||
E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
|
||||
"through token.morph_ instead or add the string to the "
|
||||
"StringStore with `nlp.vocab.strings.add(string)`.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -244,7 +244,8 @@ class Language:
|
|||
self._config["nlp"]["disabled"] = list(self.disabled)
|
||||
self._config["components"] = pipeline
|
||||
if not self._config["training"].get("score_weights"):
|
||||
self._config["training"]["score_weights"] = combine_score_weights(score_weights)
|
||||
combined_score_weights = combine_score_weights(score_weights)
|
||||
self._config["training"]["score_weights"] = combined_score_weights
|
||||
if not srsly.is_json_serializable(self._config):
|
||||
raise ValueError(Errors.E961.format(config=self._config))
|
||||
return self._config
|
||||
|
@ -1166,14 +1167,20 @@ class Language:
|
|||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="Language", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
valid_examples = False
|
||||
for example in get_examples():
|
||||
if not isinstance(example, Example):
|
||||
err = Errors.E978.format(
|
||||
name="Language.begin_training", types=type(example)
|
||||
)
|
||||
raise ValueError(err)
|
||||
else:
|
||||
valid_examples = True
|
||||
for word in [t.text for t in example.reference]:
|
||||
_ = self.vocab[word] # noqa: F841
|
||||
if not valid_examples:
|
||||
err = Errors.E930.format(name="Language", obj="empty list")
|
||||
raise ValueError(err)
|
||||
if device >= 0: # TODO: do we need this here?
|
||||
require_gpu(device)
|
||||
if self.vocab.vectors.data.shape[1] >= 1:
|
||||
|
@ -1274,7 +1281,7 @@ class Language:
|
|||
util.logger.debug(doc)
|
||||
eg.predicted = doc
|
||||
results = scorer.score(examples)
|
||||
n_words = sum(len(eg.predicted) for eg in examples)
|
||||
n_words = sum(len(doc) for doc in docs)
|
||||
results["speed"] = n_words / (end_time - start_time)
|
||||
return results
|
||||
|
||||
|
|
|
@ -56,7 +56,7 @@ subword_features = true
|
|||
@Language.factory(
|
||||
"textcat",
|
||||
assigns=["doc.cats"],
|
||||
default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL},
|
||||
default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
|
||||
scores=[
|
||||
"cats_score",
|
||||
"cats_score_desc",
|
||||
|
@ -75,6 +75,7 @@ def make_textcat(
|
|||
name: str,
|
||||
model: Model[List[Doc], List[Floats2d]],
|
||||
labels: Iterable[str],
|
||||
threshold: float,
|
||||
) -> "TextCategorizer":
|
||||
"""Create a TextCategorizer compoment. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels can
|
||||
|
@ -86,8 +87,9 @@ def make_textcat(
|
|||
scores for each category.
|
||||
labels (list): A list of categories to learn. If empty, the model infers the
|
||||
categories from the data.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
"""
|
||||
return TextCategorizer(nlp.vocab, model, name, labels=labels)
|
||||
return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold)
|
||||
|
||||
|
||||
class TextCategorizer(Pipe):
|
||||
|
@ -103,6 +105,7 @@ class TextCategorizer(Pipe):
|
|||
name: str = "textcat",
|
||||
*,
|
||||
labels: Iterable[str],
|
||||
threshold: float,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer.
|
||||
|
||||
|
@ -111,6 +114,7 @@ class TextCategorizer(Pipe):
|
|||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
labels (Iterable[str]): The labels to use.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#init
|
||||
"""
|
||||
|
@ -118,7 +122,7 @@ class TextCategorizer(Pipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
cfg = {"labels": labels}
|
||||
cfg = {"labels": labels, "threshold": threshold}
|
||||
self.cfg = dict(cfg)
|
||||
|
||||
@property
|
||||
|
@ -371,5 +375,6 @@ class TextCategorizer(Pipe):
|
|||
labels=self.labels,
|
||||
multi_label=self.model.attrs["multi_label"],
|
||||
positive_label=positive_label,
|
||||
threshold=self.cfg["threshold"],
|
||||
**kwargs,
|
||||
)
|
||||
|
|
|
@ -246,15 +246,14 @@ class ConfigSchemaPretrainEmpty(BaseModel):
|
|||
class ConfigSchemaPretrain(BaseModel):
|
||||
# fmt: off
|
||||
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
|
||||
min_length: StrictInt = Field(..., title="Minimum length of examples")
|
||||
max_length: StrictInt = Field(..., title="Maximum length of examples")
|
||||
dropout: StrictFloat = Field(..., title="Dropout rate")
|
||||
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
|
||||
batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule")
|
||||
seed: Optional[StrictInt] = Field(..., title="Random seed")
|
||||
use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
|
||||
tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. components.tok2vec.model")
|
||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||
corpus: Reader = Field(..., title="Reader for the training data")
|
||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||
component: str = Field(..., title="Component to find the layer to pretrain")
|
||||
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
|
||||
|
||||
# TODO: use a more detailed schema for this?
|
||||
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
|
||||
# fmt: on
|
||||
|
|
|
@ -9,7 +9,10 @@ from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
|||
|
||||
|
||||
def _ner_example(ner):
|
||||
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
|
||||
doc = Doc(
|
||||
ner.vocab,
|
||||
words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
|
||||
)
|
||||
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
|
||||
return Example.from_dict(doc, gold)
|
||||
|
||||
|
|
|
@ -66,3 +66,31 @@ def test_morph_set(i_has):
|
|||
def test_morph_str(i_has):
|
||||
assert str(i_has[0].morph) == "PronType=prs"
|
||||
assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin"
|
||||
|
||||
|
||||
def test_morph_property(tokenizer):
|
||||
doc = tokenizer("a dog")
|
||||
|
||||
# set through token.morph_
|
||||
doc[0].morph_ = "PronType=prs"
|
||||
assert doc[0].morph_ == "PronType=prs"
|
||||
assert doc.to_array(["MORPH"])[0] != 0
|
||||
|
||||
# unset with token.morph
|
||||
doc[0].morph = 0
|
||||
assert doc.to_array(["MORPH"])[0] == 0
|
||||
|
||||
# empty morph is equivalent to "_"
|
||||
doc[0].morph_ = ""
|
||||
assert doc[0].morph_ == ""
|
||||
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
|
||||
|
||||
# "_" morph is also equivalent to empty morph
|
||||
doc[0].morph_ = "_"
|
||||
assert doc[0].morph_ == ""
|
||||
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
|
||||
|
||||
# set through existing hash with token.morph
|
||||
tokenizer.vocab.strings.add("Feat=Val")
|
||||
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
|
||||
assert doc[0].morph_ == "Feat=Val"
|
||||
|
|
|
@ -78,7 +78,7 @@ def patterns(en_vocab):
|
|||
"REL_OP": ">",
|
||||
"RIGHT_ID": "fox",
|
||||
"RIGHT_ATTRS": {"ORTH": "fox"},
|
||||
}
|
||||
},
|
||||
]
|
||||
|
||||
pattern5 = [
|
||||
|
@ -233,9 +233,7 @@ def test_dependency_matcher_callback(en_vocab, doc):
|
|||
assert matches == matches2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20),]
|
||||
)
|
||||
@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
|
||||
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
||||
# two sentences to test that all matches are within the same sentence
|
||||
doc = get_doc(
|
||||
|
@ -248,7 +246,7 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
|||
for text in ["a", "b", "c", "d", "e"]:
|
||||
pattern = [
|
||||
{"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
|
||||
{"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {},},
|
||||
{"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}},
|
||||
]
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
matcher.add("A", [pattern])
|
||||
|
|
|
@ -54,7 +54,10 @@ def _parser_example(parser):
|
|||
|
||||
|
||||
def _ner_example(ner):
|
||||
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
|
||||
doc = Doc(
|
||||
ner.vocab,
|
||||
words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
|
||||
)
|
||||
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
|
||||
return Example.from_dict(doc, gold)
|
||||
|
||||
|
|
|
@ -30,9 +30,10 @@ TRAIN_DATA = [
|
|||
),
|
||||
]
|
||||
|
||||
|
||||
def test_begin_training_examples():
|
||||
nlp = Language()
|
||||
senter = nlp.add_pipe("senter")
|
||||
nlp.add_pipe("senter")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
|
|
@ -89,7 +89,7 @@ def test_no_label():
|
|||
|
||||
def test_implicit_label():
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
nlp.add_pipe("textcat")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
|
|
@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab):
|
|||
# See issue #1105
|
||||
cfg = {"model": DEFAULT_TEXTCAT_MODEL}
|
||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
||||
textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"])
|
||||
textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5)
|
||||
textcat.to_bytes(exclude=["vocab"])
|
||||
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@ from spacy.training import docs_to_json, biluo_tags_from_offsets
|
|||
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||
from spacy.lang.en import English
|
||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||
from spacy.cli.pretrain import make_docs
|
||||
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||
from spacy.cli._util import load_project_config, substitute_project_variables
|
||||
|
@ -231,48 +230,6 @@ def test_cli_converters_conll_ner2json():
|
|||
assert ent.text in ["New York City", "London"]
|
||||
|
||||
|
||||
def test_pretrain_make_docs():
|
||||
nlp = English()
|
||||
|
||||
valid_jsonl_text = {"text": "Some text"}
|
||||
docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)
|
||||
assert len(docs) == 1
|
||||
assert skip_count == 0
|
||||
|
||||
valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}
|
||||
docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)
|
||||
assert len(docs) == 1
|
||||
assert skip_count == 0
|
||||
|
||||
invalid_jsonl_type = 0
|
||||
with pytest.raises(TypeError):
|
||||
make_docs(nlp, [invalid_jsonl_type], 1, 100)
|
||||
|
||||
invalid_jsonl_key = {"invalid": "Does not matter"}
|
||||
with pytest.raises(ValueError):
|
||||
make_docs(nlp, [invalid_jsonl_key], 1, 100)
|
||||
|
||||
empty_jsonl_text = {"text": ""}
|
||||
docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)
|
||||
assert len(docs) == 0
|
||||
assert skip_count == 1
|
||||
|
||||
empty_jsonl_tokens = {"tokens": []}
|
||||
docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)
|
||||
assert len(docs) == 0
|
||||
assert skip_count == 1
|
||||
|
||||
too_short_jsonl = {"text": "This text is not long enough"}
|
||||
docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)
|
||||
assert len(docs) == 0
|
||||
assert skip_count == 0
|
||||
|
||||
too_long_jsonl = {"text": "This text contains way too much tokens for this test"}
|
||||
docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)
|
||||
assert len(docs) == 0
|
||||
assert skip_count == 0
|
||||
|
||||
|
||||
def test_project_config_validation_full():
|
||||
config = {
|
||||
"vars": {"some_var": 20},
|
||||
|
|
|
@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
|
|||
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
|
||||
doc = tokenizer(text)
|
||||
assert [token.text for token in doc] == ["_SPECIAL_", "."]
|
||||
|
||||
|
||||
def test_tokenizer_special_cases_idx(tokenizer):
|
||||
text = "the _ID'X_"
|
||||
tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
|
||||
doc = tokenizer(text)
|
||||
assert doc[1].idx == 4
|
||||
assert doc[2].idx == 7
|
||||
|
|
|
@ -343,8 +343,9 @@ cdef class Tokenizer:
|
|||
for j in range(cached.length):
|
||||
tokens[i + offset + j] = cached.data.tokens[j]
|
||||
tokens[i + offset + j].idx = orig_idx + idx_offset
|
||||
idx_offset += cached.data.tokens[j].lex.length + \
|
||||
1 if cached.data.tokens[j].spacy else 0
|
||||
idx_offset += cached.data.tokens[j].lex.length
|
||||
if cached.data.tokens[j].spacy:
|
||||
idx_offset += 1
|
||||
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
|
||||
i += span_end - span_start
|
||||
offset += span[3]
|
||||
|
|
|
@ -214,9 +214,17 @@ cdef class Token:
|
|||
xp = get_array_module(vector)
|
||||
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
|
||||
|
||||
@property
|
||||
def morph(self):
|
||||
return MorphAnalysis.from_id(self.vocab, self.c.morph)
|
||||
property morph:
|
||||
def __get__(self):
|
||||
return MorphAnalysis.from_id(self.vocab, self.c.morph)
|
||||
|
||||
def __set__(self, attr_t morph):
|
||||
if morph == 0:
|
||||
self.c.morph = morph
|
||||
elif morph in self.vocab.strings:
|
||||
self.morph_ = self.vocab.strings[morph]
|
||||
else:
|
||||
raise ValueError(Errors.E1009.format(val=morph))
|
||||
|
||||
property morph_:
|
||||
def __get__(self):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import warnings
|
||||
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
|
||||
from .. import util
|
||||
from .example import Example
|
||||
|
@ -21,6 +22,36 @@ def create_docbin_reader(
|
|||
) -> Callable[["Language"], Iterable[Example]]:
|
||||
return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
|
||||
|
||||
@util.registry.readers("spacy.JsonlReader.v1")
|
||||
def create_jsonl_reader(
|
||||
path: Path, min_length: int=0, max_length: int = 0, limit: int = 0
|
||||
) -> Callable[["Language"], Iterable[Doc]]:
|
||||
return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
|
||||
|
||||
|
||||
def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
|
||||
path = util.ensure_path(path)
|
||||
if not path.is_dir() and path.parts[-1].endswith(file_type):
|
||||
return [path]
|
||||
orig_path = path
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts and path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif path.parts[-1].endswith(file_type):
|
||||
locs.append(path)
|
||||
if len(locs) == 0:
|
||||
warnings.warn(Warnings.W090.format(path=orig_path))
|
||||
return locs
|
||||
|
||||
|
||||
|
||||
class Corpus:
|
||||
"""Iterate Example objects from a file or directory of DocBin (.spacy)
|
||||
|
@ -47,36 +78,13 @@ class Corpus:
|
|||
*,
|
||||
limit: int = 0,
|
||||
gold_preproc: bool = False,
|
||||
max_length: bool = False,
|
||||
max_length: int = 0,
|
||||
) -> None:
|
||||
self.path = util.ensure_path(path)
|
||||
self.gold_preproc = gold_preproc
|
||||
self.max_length = max_length
|
||||
self.limit = limit
|
||||
|
||||
@staticmethod
|
||||
def walk_corpus(path: Union[str, Path]) -> List[Path]:
|
||||
path = util.ensure_path(path)
|
||||
if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE):
|
||||
return [path]
|
||||
orig_path = path
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts and path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif path.parts[-1].endswith(FILE_TYPE):
|
||||
locs.append(path)
|
||||
if len(locs) == 0:
|
||||
warnings.warn(Warnings.W090.format(path=orig_path))
|
||||
return locs
|
||||
|
||||
def __call__(self, nlp: "Language") -> Iterator[Example]:
|
||||
"""Yield examples from the data.
|
||||
|
||||
|
@ -85,11 +93,11 @@ class Corpus:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/corpus#call
|
||||
"""
|
||||
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
|
||||
ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE))
|
||||
if self.gold_preproc:
|
||||
examples = self.make_examples_gold_preproc(nlp, ref_docs)
|
||||
else:
|
||||
examples = self.make_examples(nlp, ref_docs, self.max_length)
|
||||
examples = self.make_examples(nlp, ref_docs)
|
||||
yield from examples
|
||||
|
||||
def _make_example(
|
||||
|
@ -108,18 +116,18 @@ class Corpus:
|
|||
return Example(nlp.make_doc(reference.text), reference)
|
||||
|
||||
def make_examples(
|
||||
self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
|
||||
self, nlp: "Language", reference_docs: Iterable[Doc]
|
||||
) -> Iterator[Example]:
|
||||
for reference in reference_docs:
|
||||
if len(reference) == 0:
|
||||
continue
|
||||
elif max_length == 0 or len(reference) < max_length:
|
||||
elif self.max_length == 0 or len(reference) < self.max_length:
|
||||
yield self._make_example(nlp, reference, False)
|
||||
elif reference.is_sentenced:
|
||||
for ref_sent in reference.sents:
|
||||
if len(ref_sent) == 0:
|
||||
continue
|
||||
elif max_length == 0 or len(ref_sent) < max_length:
|
||||
elif self.max_length == 0 or len(ref_sent) < self.max_length:
|
||||
yield self._make_example(nlp, ref_sent.as_doc(), False)
|
||||
|
||||
def make_examples_gold_preproc(
|
||||
|
@ -151,3 +159,57 @@ class Corpus:
|
|||
i += 1
|
||||
if self.limit >= 1 and i >= self.limit:
|
||||
break
|
||||
|
||||
|
||||
class JsonlTexts:
|
||||
"""Iterate Doc objects from a file or directory of jsonl
|
||||
formatted raw text files.
|
||||
|
||||
path (Path): The directory or filename to read from.
|
||||
min_length (int): Minimum document length (in tokens). Shorter documents
|
||||
will be skipped. Defaults to 0, which indicates no limit.
|
||||
|
||||
max_length (int): Maximum document length (in tokens). Longer documents will
|
||||
be skipped. Defaults to 0, which indicates no limit.
|
||||
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
||||
Defaults to 0, which indicates no limit.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/corpus
|
||||
"""
|
||||
file_type = "jsonl"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: Union[str, Path],
|
||||
*,
|
||||
limit: int = 0,
|
||||
min_length: int = 0,
|
||||
max_length: int = 0,
|
||||
) -> None:
|
||||
self.path = util.ensure_path(path)
|
||||
self.min_length = min_length
|
||||
self.max_length = max_length
|
||||
self.limit = limit
|
||||
|
||||
def __call__(self, nlp: "Language") -> Iterator[Example]:
|
||||
"""Yield examples from the data.
|
||||
|
||||
nlp (Language): The current nlp object.
|
||||
YIELDS (Doc): The docs.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/corpus#call
|
||||
"""
|
||||
for loc in walk_corpus(self.path, "jsonl"):
|
||||
records = srsly.read_jsonl(loc)
|
||||
for record in records:
|
||||
doc = nlp.make_doc(record["text"])
|
||||
if self.min_length >= 1 and len(doc) < self.min_length:
|
||||
continue
|
||||
elif self.max_length >= 1 and len(doc) >= self.max_length:
|
||||
continue
|
||||
else:
|
||||
words = [w.text for w in doc]
|
||||
spaces = [bool(w.whitespace_) for w in doc]
|
||||
# We don't *need* an example here, but it seems nice to
|
||||
# make it match the Corpus signature.
|
||||
yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
|
||||
|
|
|
@ -36,20 +36,12 @@ def console_logger():
|
|||
keys=list(info["losses"].keys()),
|
||||
)
|
||||
) from None
|
||||
|
||||
try:
|
||||
scores = [
|
||||
"{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100)
|
||||
for col in score_cols
|
||||
]
|
||||
except KeyError as e:
|
||||
raise KeyError(
|
||||
Errors.E983.format(
|
||||
dict="scores (other)",
|
||||
key=str(e),
|
||||
keys=list(info["other_scores"].keys()),
|
||||
)
|
||||
) from None
|
||||
scores = []
|
||||
for col in score_cols:
|
||||
score = float(info["other_scores"].get(col, 0.0))
|
||||
if col != "speed":
|
||||
score *= 100
|
||||
scores.append("{0:.2f}".format(score))
|
||||
data = (
|
||||
[info["epoch"], info["step"]]
|
||||
+ losses
|
||||
|
|
|
@ -648,12 +648,20 @@ def join_command(command: List[str]) -> str:
|
|||
return " ".join(shlex.quote(cmd) for cmd in command)
|
||||
|
||||
|
||||
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
|
||||
def run_command(
|
||||
command: Union[str, List[str]],
|
||||
*,
|
||||
capture: bool = False,
|
||||
stdin: Optional[Any] = None,
|
||||
) -> Optional[subprocess.CompletedProcess]:
|
||||
"""Run a command on the command line as a subprocess. If the subprocess
|
||||
returns a non-zero exit code, a system exit is performed.
|
||||
|
||||
command (str / List[str]): The command. If provided as a string, the
|
||||
string will be split using shlex.split.
|
||||
stdin (Optional[Any]): stdin to read from or None.
|
||||
capture (bool): Whether to capture the output.
|
||||
RETURNS (Optional[CompletedProcess]): The process object.
|
||||
"""
|
||||
if isinstance(command, str):
|
||||
command = split_command(command)
|
||||
|
@ -671,6 +679,10 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
|
|||
raise FileNotFoundError(
|
||||
Errors.E970.format(str_command=" ".join(command), tool=command[0])
|
||||
) from None
|
||||
except subprocess.CalledProcessError as e:
|
||||
# We don't want a duplicate traceback here
|
||||
print(e)
|
||||
sys.exit(1)
|
||||
if ret.returncode != 0:
|
||||
sys.exit(ret.returncode)
|
||||
return ret
|
||||
|
|
|
@ -14,6 +14,7 @@ menu:
|
|||
- ['evaluate', 'evaluate']
|
||||
- ['package', 'package']
|
||||
- ['project', 'project']
|
||||
- ['ray', 'ray']
|
||||
---
|
||||
|
||||
spaCy's CLI provides a range of helpful commands for downloading and training
|
||||
|
@ -1134,3 +1135,47 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
|||
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
||||
|
||||
## ray {#ray new="3"}
|
||||
|
||||
The `spacy ray` CLI includes commands for parallel and distributed computing via
|
||||
[Ray](https://ray.io).
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
To use this command, you need the
|
||||
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
|
||||
Installing the package will automatically add the `ray` command to the spaCy
|
||||
CLI.
|
||||
|
||||
</Infobox>
|
||||
|
||||
### ray train {#ray-train tag="command"}
|
||||
|
||||
Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
|
||||
command works just like [`spacy train`](/api/cli#train). For more details and
|
||||
examples, see the usage guide on
|
||||
[parallel training](/usage/training#parallel-training) and the spaCy project
|
||||
[integration](/usage/projects#ray).
|
||||
|
||||
```cli
|
||||
$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```cli
|
||||
> $ python -m spacy ray train config.cfg --n-workers 2
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
|
||||
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
|
|
|
@ -30,15 +30,17 @@ architectures and their arguments and hyperparameters.
|
|||
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
|
||||
> config = {
|
||||
> "labels": [],
|
||||
> "threshold": 0.5,
|
||||
> "model": DEFAULT_TEXTCAT_MODEL,
|
||||
> }
|
||||
> nlp.add_pipe("textcat", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| Setting | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
|
||||
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/textcat.py
|
||||
|
@ -58,7 +60,7 @@ architectures and their arguments and hyperparameters.
|
|||
>
|
||||
> # Construction from class
|
||||
> from spacy.pipeline import TextCategorizer
|
||||
> textcat = TextCategorizer(nlp.vocab, model)
|
||||
> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5)
|
||||
> ```
|
||||
|
||||
Create a new pipeline instance. In your application, you would normally use a
|
||||
|
@ -72,6 +74,7 @@ shortcut for this and instantiate the component using its string name and
|
|||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `labels` | The labels to use. ~~Iterable[str]~~ |
|
||||
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
|
||||
|
||||
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
|
55
website/docs/images/spacy-ray.svg
Normal file
55
website/docs/images/spacy-ray.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 67 KiB |
|
@ -26,7 +26,7 @@ on training Stanza on this corpus to allow direct comparison.
|
|||
|
||||
<figure>
|
||||
|
||||
| System | POS | USA | LAS |
|
||||
| System | POS | UAS | LAS |
|
||||
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
|
||||
| spaCy RoBERTa (2020) | | | |
|
||||
| spaCy CNN (2020) | | | |
|
||||
|
|
|
@ -61,17 +61,13 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
|
|||
|
||||
<Benchmarks />
|
||||
|
||||
<!-- TODO:
|
||||
|
||||
<Project id="benchmarks/penn_treebank">
|
||||
<Project id="benchmarks/parsing_penn_treebank">
|
||||
|
||||
The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
|
||||
our project template.
|
||||
|
||||
</Project>
|
||||
|
||||
-->
|
||||
|
||||
<!-- ## Citing spaCy {#citation}
|
||||
|
||||
<!-- TODO: update -->
|
||||
|
|
|
@ -796,11 +796,9 @@ workflows, including
|
|||
evaluation workflow that lets you compare two different models and their
|
||||
results.
|
||||
|
||||
<Project id="integrations/prodigy">
|
||||
<!-- TODO: <Project id="integrations/prodigy">
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
</Project>
|
||||
</Project> -->
|
||||
|
||||
---
|
||||
|
||||
|
@ -817,7 +815,7 @@ full embedded visualizer, as well as individual components.
|
|||
> #### Installation
|
||||
>
|
||||
> ```bash
|
||||
> $ pip install "spacy_streamlit>=1.0.0a0"
|
||||
> $ pip install "spacy-streamlit>=1.0.0a0"
|
||||
> ```
|
||||
|
||||

|
||||
|
@ -915,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
|
|||
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
|
||||
</Infobox>
|
||||
|
||||
<!-- TODO: document -->
|
||||
> #### Installation
|
||||
>
|
||||
> ```cli
|
||||
> $ pip install spacy-ray
|
||||
> # Check that the CLI is registered
|
||||
> $ python -m spacy ray --help
|
||||
> ```
|
||||
|
||||
[Ray](https://ray.io/) is a fast and simple framework for building and running
|
||||
**distributed applications**. You can use Ray for parallel and distributed
|
||||
training with spaCy via our lightweight
|
||||
[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
|
||||
package is installed in the same environment as spaCy, it will automatically add
|
||||
[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
|
||||
|
||||
You can integrate [`spacy ray train`](/api/cli#ray-train) into your
|
||||
`project.yml` just like the regular training command:
|
||||
|
||||
<!-- prettier-ignore -->
|
||||
```yaml
|
||||
### project.yml
|
||||
- name: "ray"
|
||||
help: "Train a model via parallel training with Ray"
|
||||
script:
|
||||
- "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
|
||||
deps:
|
||||
- "corpus/train.spacy"
|
||||
- "corpus/dev.spacy"
|
||||
```
|
||||
|
||||
<!-- TODO: <Project id="integrations/ray">
|
||||
|
||||
</Project> -->
|
||||
|
||||
---
|
||||
|
||||
|
@ -943,12 +973,14 @@ your results.
|
|||
|
||||

|
||||
|
||||
<!-- TODO:
|
||||
|
||||
<Project id="integrations/wandb">
|
||||
|
||||
Get started with tracking your spaCy training runs in Weights & Biases using our
|
||||
project template. It includes a simple config using the `WandbLogger`, as well
|
||||
as a custom logger implementation you can adjust for your specific use case.
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
</Project>
|
||||
|
||||
-->
|
||||
|
|
|
@ -1075,7 +1075,7 @@ relations and tokens we want to match:
|
|||
|
||||
> #### Visualizing the parse
|
||||
>
|
||||
> The [`displacy` visualizer](/usage/visualizer) lets you render `Doc` objects
|
||||
> The [`displacy` visualizer](/usage/visualizers) lets you render `Doc` objects
|
||||
> and their dependency parse and part-of-speech tags:
|
||||
>
|
||||
> ```python
|
||||
|
|
|
@ -7,7 +7,7 @@ menu:
|
|||
- ['Quickstart', 'quickstart']
|
||||
- ['Config System', 'config']
|
||||
- ['Custom Functions', 'custom-functions']
|
||||
# - ['Parallel Training', 'parallel-training']
|
||||
- ['Parallel Training', 'parallel-training']
|
||||
- ['Internal API', 'api']
|
||||
---
|
||||
|
||||
|
@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
|||
return create_model(output_width)
|
||||
```
|
||||
|
||||
## Parallel & distributed training with Ray {#parallel-training}
|
||||
|
||||
> #### Installation
|
||||
>
|
||||
> ```cli
|
||||
> $ pip install spacy-ray
|
||||
> # Check that the CLI is registered
|
||||
> $ python -m spacy ray --help
|
||||
> ```
|
||||
|
||||
[Ray](https://ray.io/) is a fast and simple framework for building and running
|
||||
**distributed applications**. You can use Ray to train spaCy on one or more
|
||||
remote machines, potentially speeding up your training process. Parallel
|
||||
training won't always be faster though – it depends on your batch size, models,
|
||||
and hardware.
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
To use Ray with spaCy, you need the
|
||||
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
|
||||
Installing the package will automatically add the `ray` command to the spaCy
|
||||
CLI.
|
||||
|
||||
</Infobox>
|
||||
|
||||
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
|
||||
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
|
||||
setup. You can optionally set the `--address` option to point to your Ray
|
||||
cluster. If it's not set, Ray will run locally.
|
||||
|
||||
```cli
|
||||
python -m spacy ray train config.cfg --n-workers 2
|
||||
```
|
||||
|
||||
<!-- TODO: <Project id="integrations/ray">
|
||||
|
||||
</Project> -->
|
||||
|
||||
### How parallel training works {#parallel-training-details}
|
||||
|
||||
Each worker receives a shard of the **data** and builds a copy of the **model
|
||||
and optimizer** from the [`config.cfg`](#config). It also has a communication
|
||||
channel to **pass gradients and parameters** to the other workers. Additionally,
|
||||
each worker is given ownership of a subset of the parameter arrays. Every
|
||||
parameter array is owned by exactly one worker, and the workers are given a
|
||||
mapping so they know which worker owns which parameter.
|
||||
|
||||

|
||||
|
||||
As training proceeds, every worker will be computing gradients for **all** of
|
||||
the model parameters. When they compute gradients for parameters they don't own,
|
||||
they'll **send them to the worker** that does own that parameter, along with a
|
||||
version identifier so that the owner can decide whether the discard the
|
||||
gradient. Workers use the gradients they receive and the ones they compute
|
||||
locally to update the parameters they own, and then broadcast the updated array
|
||||
and a new version ID to the other workers.
|
||||
|
||||
This training procedure is **asynchronous** and **non-blocking**. Workers always
|
||||
push their gradient increments and parameter updates, they do not have to pull
|
||||
them and block on the result, so the transfers can happen in the background,
|
||||
overlapped with the actual training work. The workers also do not have to stop
|
||||
and wait for each other ("synchronize") at the start of each batch. This is very
|
||||
useful for spaCy, because spaCy is often trained on long documents, which means
|
||||
**batches can vary in size** significantly. Uneven workloads make synchronous
|
||||
gradient descent inefficient, because if one batch is slow, all of the other
|
||||
workers are stuck waiting for it to complete before they can continue.

## Internal training API {#api}

<Infobox variant="warning">

@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
- [Training & config system](#features-training)
- [Custom models](#features-custom-models)
- [End-to-end project workflows](#features-projects)
- [Parallel training with Ray](#features-parallel-training)
- [New built-in components](#features-pipeline-components)
- [New custom component API](#features-components)
- [Dependency matching](#features-dep-matcher)

@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.

</Infobox>

### Parallel and distributed training with Ray {#features-parallel-training}

> #### Example
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> # Train a pipeline
> $ python -m spacy ray train config.cfg --n-workers 2
> ```

[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. The Ray
integration is powered by a lightweight extension package,
[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
parallel training.

![Illustration of setup](../images/spacy-ray.svg)

<Infobox title="Details & Documentation" emoji="📖" list>

- **Usage:**
  [Parallel and distributed training](/usage/training#parallel-training),
  [spaCy Projects integration](/usage/projects#ray)
- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)

</Infobox>

### New built-in pipeline components {#features-pipeline-components}

spaCy v3.0 includes several new trainable and rule-based components that you can

@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |

### New and updated documentation {#new-docs}

@ -26,11 +26,27 @@ const replacements = {
    GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
}

/**
 * Compute the overall total counts of models and languages
 */
function getCounts(langs = []) {
    return {
        langs: langs.length,
        modelLangs: langs.filter(({ models }) => models && !!models.length).length,
        starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
        models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
        starters: langs
            .map(({ starters }) => (starters ? starters.length : 0))
            .reduce((a, b) => a + b, 0),
    }
}

module.exports = {
    siteMetadata: {
        ...site,
        sidebars,
        ...models,
        counts: getCounts(models.languages),
        universe,
        nightly: isNightly,
        binderBranch,

@ -1,5 +1,16 @@
{
    "resources": [
        {
            "id": "spacy-ray",
            "title": "spacy-ray",
            "slogan": "Parallel and distributed training with spaCy and Ray",
            "description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
            "github": "explosion/spacy-ray",
            "pip": "spacy-ray",
            "category": ["training"],
            "author": "Explosion / Anyscale",
            "thumb": "https://i.imgur.com/7so6ZpS.png"
        },
        {
            "id": "spacy-sentence-bert",
            "title": "spaCy - sentence-transformers",

@ -2518,14 +2529,14 @@
        "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
        "pip": "cov-bsv",
        "code_example": [
            "import cov_bsv",
            "",
            "nlp = cov_bsv.load()",
            "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
            "doc = nlp(text)",
            "",
            "print(doc.ents)",
            "print(doc._.cov_classification)",
            "cov_bsv.visualize_doc(doc)"
            "import cov_bsv",
            "",
            "nlp = cov_bsv.load()",
            "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
            "doc = nlp(text)",
            "",
            "print(doc.ents)",
            "print(doc._.cov_classification)",
            "cov_bsv.visualize_doc(doc)"
        ],
        "category": ["pipeline", "standalone", "biomedical", "scientific"],
        "tags": ["clinical", "epidemiology", "covid-19", "surveillance"],

@ -14,6 +14,7 @@ import GitHubCode from './github'
import classes from '../styles/code.module.sass'

const WRAP_THRESHOLD = 30
const CLI_GROUPS = ['init', 'debug', 'project', 'ray']

export default props => (
    <Pre>

@ -99,7 +100,6 @@ function replacePrompt(line, prompt, isFirst = false) {
}

function parseArgs(raw) {
    const commandGroups = ['init', 'debug', 'project']
    let args = raw.split(' ').filter(arg => arg)
    const result = {}
    while (args.length) {

@ -108,7 +108,12 @@ function parseArgs(raw) {
            const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
            result[opt] = isFlag ? true : args.shift()
        } else {
            const key = commandGroups.includes(opt) ? `${opt} ${args.shift()}` : opt
            let key = opt
            if (CLI_GROUPS.includes(opt)) {
                if (args.length && !args[0].startsWith('-')) {
                    key = `${opt} ${args.shift()}`
                }
            }
            result[key] = null
        }
    }

@ -38,8 +38,8 @@ export const LandingSubtitle = ({ children }) => (
)

export const LandingGrid = ({ cols = 3, blocks = false, style, children }) => (
    <Content className={classNames(classes.grid, { [classes.blocks]: blocks })}>
        <Grid cols={cols} narrow={blocks} style={style}>
    <Content className={classNames({ [classes.blocks]: blocks })}>
        <Grid cols={cols} narrow={blocks} className={classes.grid} style={style}>
            {children}
        </Grid>
    </Content>

@ -26,8 +26,11 @@
    border-bottom-right-radius: 0

.icon
    width: 2rem
    height: 2rem
    $width: 2rem

    width: $width
    height: $width
    flex: 0 0 $width
    background: var(--color-theme)
    color: var(--color-back)
    border-radius: 50%

@ -128,14 +128,17 @@
    padding-right: 2rem

@include breakpoint(max, md)
    .banner
        padding: 1rem 3rem

    .banner-content
        display: block

    .banner-text
        padding-top: 0

    .col
        grid-column: 1 / span 2
    .grid
        grid-template-columns: 1fr !important

    .banner-button
        margin-bottom: var(--spacing-sm)

@ -54,23 +54,8 @@ for entity in doc.ents:
    print(entity.text, entity.label_)
`

/**
 * Compute the overall total counts of models and languages
 */
function getCounts(langs = []) {
    return {
        langs: langs.length,
        modelLangs: langs.filter(({ models }) => models && !!models.length).length,
        starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
        models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
        starters: langs
            .map(({ starters }) => (starters ? starters.length : 0))
            .reduce((a, b) => a + b, 0),
    }
}

const Landing = ({ data }) => {
    const counts = getCounts(data.languages)
    const { counts } = data
    return (
        <>
            <LandingHeader nightly={data.nightly}>

@ -345,7 +330,10 @@ const landingQuery = graphql`
    siteMetadata {
        nightly
        repo
        languages {
        counts {
            langs
            modelLangs
            starterLangs
            models
            starters
        }