Mirror of https://github.com/explosion/spaCy.git, synced 2025-03-21 10:24:26 +03:00

Commit e22de2e69d: Merge branch 'develop' into nightly.spacy.io

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a33,<8.0.0a40",
+    "thinc>=8.0.0a34,<8.0.0a40",
    "blis>=0.4.0,<0.5.0",
    "pytokenizations",
    "pathy"

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a33,<8.0.0a40
+thinc>=8.0.0a34,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a33,<8.0.0a40
+    thinc>=8.0.0a34,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a33,<8.0.0a40
+    thinc>=8.0.0a34,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0

@@ -308,6 +308,31 @@ def git_checkout(
         msg.fail("Destination of checkout must not exist", exits=1)
     if not dest.parent.exists():
         raise IOError("Parent of destination of checkout must exist")
+
+    if sparse and git_version >= (2, 22):
+        return git_sparse_checkout(repo, subpath, dest, branch)
+    elif sparse:
+        # Only show warnings if the user explicitly wants sparse checkout but
+        # the Git version doesn't support it
+        err_old = (
+            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
+            f"that doesn't fully support sparse checkout yet."
+        )
+        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
+        msg.warn(
+            f"{err_unk if git_version == (0, 0) else err_old} "
+            f"This means that more files than necessary may be downloaded "
+            f"temporarily. To only download the files needed, make sure "
+            f"you're using Git v2.22 or above."
+        )
+    with make_tempdir() as tmp_dir:
+        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
+        ret = run_command(cmd, capture=True)
+        # We need Path(name) to make sure we also support subdirectories
+        shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
+
+
+def git_sparse_checkout(repo, subpath, dest, branch):
     # We're using Git, partial clone and sparse checkout to
     # only clone the files we need
     # This ends up being RIDICULOUS. omg.

@@ -324,47 +349,31 @@ def git_checkout(
     # *that* we can do by path.
     # We're using Git and sparse checkout to only clone the files we need
     with make_tempdir() as tmp_dir:
-        supports_sparse = git_version >= (2, 22)
-        use_sparse = supports_sparse and sparse
         # This is the "clone, but don't download anything" part.
-        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
-        if use_sparse:
-            cmd += f"--filter=blob:none"  # <-- The key bit
-        # Only show warnings if the user explicitly wants sparse checkout but
-        # the Git version doesn't support it
-        elif sparse:
-            err_old = (
-                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-                f"that doesn't fully support sparse checkout yet."
-            )
-            err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
-            msg.warn(
-                f"{err_unk if git_version == (0, 0) else err_old} "
-                f"This means that more files than necessary may be downloaded "
-                f"temporarily. To only download the files needed, make sure "
-                f"you're using Git v2.22 or above."
-            )
-        try_run_command(cmd)
+        cmd = (
+            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
+            f"-b {branch} --filter=blob:none"
+        )
+        run_command(cmd)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
-        ret = try_run_command(cmd)
+        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
+        ret = run_command(cmd, capture=True)
         git_repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals
         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        if use_sparse and not missings:
+        if not missings:
             err = (
                 f"Could not find any relevant files for '{subpath}'. "
                 f"Did you specify a correct and complete path within repo '{repo}' "
                 f"and branch {branch}?"
             )
             msg.fail(err, exits=1)
-        if use_sparse:
-            cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-            try_run_command(cmd)
+        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
+        run_command(cmd, capture=True)
         # And finally, we can checkout our subpath
         cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        try_run_command(cmd)
+        run_command(cmd, capture=True)
         # We need Path(name) to make sure we also support subdirectories
         shutil.move(str(tmp_dir / Path(subpath)), str(dest))

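For reference, a minimal usage sketch of the reworked checkout entry point. This is an illustration only: the keyword defaults are inferred from the hunks above, and the repo URL and subpath are placeholders.

```python
from pathlib import Path
from spacy.cli._util import git_checkout

# With sparse=True and Git >= v2.22, this dispatches to git_sparse_checkout()
# and downloads only the blobs under the requested subpath. On older Git it
# warns and falls back to a plain shallow clone of the whole repo.
git_checkout(
    "https://github.com/explosion/projects",  # placeholder repo URL
    "some/subpath",                           # placeholder path inside the repo
    Path("local_dest"),
    branch="master",
    sparse=True,
)
```
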
@@ -378,7 +387,7 @@ def get_git_version(
     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
         (0, 0) if the version couldn't be determined.
     """
-    ret = try_run_command(["git", "--version"], error=error)
+    ret = run_command("git --version", capture=True)
     stdout = ret.stdout.strip()
     if not stdout or not stdout.startswith("git version"):
         return (0, 0)

@@ -386,23 +395,6 @@ def get_git_version(
     return (int(version[0]), int(version[1]))


-def try_run_command(
-    cmd: Union[str, List[str]], error: str = "Could not run command"
-) -> subprocess.CompletedProcess:
-    """Try running a command and raise an error if it fails.
-
-    cmd (Union[str, List[str]]): The command to run.
-    error (str): The error message.
-    RETURNS (CompletedProcess): The completed process if the command ran.
-    """
-    try:
-        return run_command(cmd, capture=True)
-    except subprocess.CalledProcessError as e:
-        msg.fail(error)
-        print(cmd)
-        sys.exit(1)
-
-
 def _from_http_to_git(repo: str) -> str:
     if repo.startswith("http://"):
         repo = repo.replace(r"http://", r"https://")

@@ -2,7 +2,7 @@ from typing import Dict, Any, Optional
 from pathlib import Path
 from wasabi import msg
 from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
-from thinc.api import Model, data_validation
+from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error

@@ -53,7 +53,12 @@ def debug_model_cli(
     }
     config_overrides = parse_config_overrides(ctx.args)
     with show_validation_error(config_path):
-        config = util.load_config(config_path, overrides=config_overrides)
+        config = util.load_config(
+            config_path, overrides=config_overrides, interpolate=True
+        )
+        allocator = config["training"]["gpu_allocator"]
+        if use_gpu >= 0 and allocator:
+            set_gpu_allocator(allocator)
         nlp, config = util.load_model_from_config(config_path)
     seed = config["training"]["seed"]
     if seed is not None:

@@ -110,7 +110,7 @@ def package(
     msg.good(f"Successfully created package '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "sdist"])
+            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
         msg.good(f"Successfully created zipped Python package", zip_file)

@@ -4,10 +4,9 @@ import time
 import re
 from collections import Counter
 from pathlib import Path
-from thinc.api import Config
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu
+from thinc.api import require_gpu, set_gpu_allocator
 from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import CosineDistance, L2Distance
+from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
 import srsly
 from functools import partial

@@ -32,7 +31,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),

@@ -99,10 +98,12 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
 ):
-    if config["system"].get("seed") is not None:
-        fix_random_seed(config["system"]["seed"])
-    if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
-        use_pytorch_for_gpu_memory()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+
     nlp, config = util.load_model_from_config(config)
     P_cfg = config["pretraining"]
     corpus = dot_to_object(config, P_cfg["corpus"])

@@ -59,8 +59,9 @@ def project_run(
     for dep in cmd.get("deps", []):
         if not (project_dir / dep).exists():
             err = f"Missing dependency specified by command '{subcommand}': {dep}"
+            err_help = "Maybe you forgot to run the 'project assets' command?"
             err_kwargs = {"exits": 1} if not dry else {}
-            msg.fail(err, **err_kwargs)
+            msg.fail(err, err_help, **err_kwargs)
     with working_dir(project_dir) as current_dir:
         rerun = check_rerun(current_dir, cmd)
         if not rerun and not force:

@@ -144,7 +145,7 @@ def run_commands(
         if not silent:
             print(f"Running command: {join_command(command)}")
         if not dry:
-            run_command(command)
+            run_command(command, capture=False)


 def validate_subcommand(

@@ -8,7 +8,11 @@ train = ""
 dev = ""

 [system]
-use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
+{% if use_transformer -%}
+gpu_allocator = "pytorch"
+{% else -%}
+gpu_allocator = null
+{% endif %}

 [nlp]
 lang = "{{ lang }}"

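Rendered, the new template branch produces one of two `[system]` blocks. A sketch of the outcome:

```ini
# With use_transformer = true, the template renders:
[system]
gpu_allocator = "pytorch"
# ...and with use_transformer = false it renders gpu_allocator = null instead.
```
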
@@ -6,8 +6,7 @@ from pathlib import Path
 from wasabi import msg
 import thinc
 import thinc.schedules
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
 import random
 import typer
 import logging

@@ -29,7 +28,7 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     resume: bool = Opt(False, "--resume", "-R", help="Resume training"),

@@ -79,15 +78,16 @@ def train(
         config = util.load_config(
             config_path, overrides=config_overrides, interpolate=True
         )
-    if config.get("training", {}).get("seed") is not None:
+    if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
-    if config.get("system", {}).get("use_pytorch_for_gpu_memory"):
-        # It feels kind of weird to not have a default for this.
-        use_pytorch_for_gpu_memory()
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+        util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)

@@ -6,13 +6,12 @@ init_tok2vec = null

 [system]
 seed = 0
-use_pytorch_for_gpu_memory = false
+gpu_allocator = null

 [nlp]
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null

@@ -52,12 +51,14 @@ limit = 0
 # Training hyper-parameters and additional features.
 [training]
 seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0

@@ -75,7 +76,6 @@ train_corpus = "corpora.train"
 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
-

 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
 discard_oversize = false

@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups


 # This is the base config will all settings (training etc.)

@@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.

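The registered loader can then be referenced from a training config through the `@misc` registry. A sketch, assuming the listed tables exist in `spacy-lookups-data`:

```ini
[training.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm", "lexeme_prob"]
```
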
@@ -152,7 +160,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):

@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer

 from .attrs import NAMES
+from .lookups import Lookups

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports

@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")

@@ -207,6 +209,7 @@ class ConfigSchemaTraining(BaseModel):
     max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
     eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
     seed: Optional[StrictInt] = Field(..., title="Random seed")
+    gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")

@@ -227,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")

@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false

     [components]

@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)

@@ -253,6 +253,14 @@ def load_vectors_into_model(
             nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"] = None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.lookups = lookups
+
+
 def load_model(
     name: Union[str, Path],
     *,

@@ -651,8 +659,8 @@ def join_command(command: List[str]) -> str:
 def run_command(
     command: Union[str, List[str]],
     *,
-    capture: bool = False,
     stdin: Optional[Any] = None,
+    capture: bool = False,
 ) -> Optional[subprocess.CompletedProcess]:
     """Run a command on the command line as a subprocess. If the subprocess
     returns a non-zero exit code, a system exit is performed.

@@ -660,33 +668,46 @@ def run_command(
     command (str / List[str]): The command. If provided as a string, the
         string will be split using shlex.split.
     stdin (Optional[Any]): stdin to read from or None.
-    capture (bool): Whether to capture the output.
+    capture (bool): Whether to capture the output and errors. If False,
+        the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the returncode. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
     RETURNS (Optional[CompletedProcess]): The process object.
     """
     if isinstance(command, str):
-        command = split_command(command)
+        cmd_list = split_command(command)
+        cmd_str = command
+    else:
+        cmd_list = command
+        cmd_str = " ".join(command)
     try:
         ret = subprocess.run(
-            command,
+            cmd_list,
             env=os.environ.copy(),
             input=stdin,
             encoding="utf8",
-            check=True,
+            check=False,
             stdout=subprocess.PIPE if capture else None,
-            stderr=subprocess.PIPE if capture else None,
+            stderr=subprocess.STDOUT if capture else None,
         )
     except FileNotFoundError:
+        # Indicates the *command* wasn't found, it's an error before the command
+        # is run.
         raise FileNotFoundError(
-            Errors.E970.format(str_command=" ".join(command), tool=command[0])
+            Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
         ) from None
-    except subprocess.CalledProcessError as e:
-        # We don't want a duplicate traceback here so we're making sure the
-        # CalledProcessError isn't re-raised. We also print both the string
-        # message and the stderr, in case the error only has one of them.
-        print(e.stderr)
-        print(e)
-        sys.exit(1)
-    if ret.returncode != 0:
+    if ret.returncode != 0 and capture:
+        message = f"Error running command:\n\n{cmd_str}\n\n"
+        message += f"Subprocess exited with status {ret.returncode}"
+        if ret.stdout is not None:
+            message += f"\n\nProcess log (stdout and stderr):\n\n"
+            message += ret.stdout
+        error = subprocess.SubprocessError(message)
+        error.ret = ret
+        error.command = cmd_str
+        raise error
+    elif ret.returncode != 0:
         sys.exit(ret.returncode)
     return ret

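A short sketch of how the two modes behave under the new semantics:

```python
import subprocess
from spacy.util import run_command

# capture=True: run the command like a function and inspect its output.
# A non-zero exit now raises subprocess.SubprocessError instead of exiting.
try:
    ret = run_command("git --version", capture=True)
    print(ret.stdout.strip())
except subprocess.SubprocessError as err:
    print(f"Command failed: {err}")

# capture=False: turn execution over to the command. stdout/stderr are not
# redirected, and a non-zero exit calls sys.exit with the return code.
run_command("git --version", capture=False)
```
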
@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length

@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)

@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,

@@ -424,6 +417,19 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors

+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.

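A sketch of what the new setter enables; the table contents here are illustrative:

```python
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lookups = Lookups()
lookups.add_table("lexeme_norm", {"Gonna": "gonna"})  # illustrative data
# Assigning to vocab.lookups now also refreshes the NORM lexeme getter
# whenever a "lexeme_norm" table is present.
nlp.vocab.lookups = lookups
```
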
@@ -763,6 +763,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
 | `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
 | **CREATES** | The final trained pipeline and the best trained pipeline. |

@@ -798,11 +799,12 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [

 | Name | Description |
 | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
 | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
+| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
 | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
+| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
 | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |

@@ -893,8 +895,6 @@ what you need. By default, spaCy's
 can provide any other repo (public or private) that you have access to using the
 `--repo` option.

-<!-- TODO: update example once we've decided on repo structure -->
-
 ```cli
 $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
 ```

@@ -902,7 +902,7 @@
 > #### Example
 >
 > ```cli
-> $ python -m spacy project clone some_example
+> $ python -m spacy project clone pipelines/ner_wikiner
 > ```
 >
 > Clone from custom repo:

@@ -60,7 +60,6 @@ your config and check that it's valid, you can run the
 > [nlp]
 > lang = "en"
 > pipeline = ["tagger", "parser", "ner"]
-> load_vocab_data = true
 > before_creation = null
 > after_creation = null
 > after_pipeline_creation = null

@@ -77,7 +76,6 @@ Defines the `nlp` object, its tokenizer and
 | `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
 | `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
 | `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
-| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
 | `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
 | `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
 | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |

@@ -190,7 +188,9 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
 | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
 | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
+| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ |
 | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
 | `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
 | `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |

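Like other training settings, the new keys can be overridden on the command line, for example:

```cli
$ python -m spacy train config.cfg --training.gpu_allocator pytorch
```
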
@@ -475,7 +475,7 @@ lexical data.
 Here's an example of the 20 most frequent lexemes in the English training data:

 ```json
-%%GITHUB_SPACY / extra / example_data / vocab - data.jsonl
+%%GITHUB_SPACY/extra/example_data/vocab-data.jsonl
 ```

 ## Pipeline meta {#meta}

@@ -145,9 +145,10 @@ pipelines.
 > nlp = spacy.load("en_core_web_sm")
 > ```

 | Name | Description |
-| ----------- | --------------------------------------- |
-| **RETURNS** | Whether the GPU was activated. ~~bool~~ |
+| ----------- | ------------------------------------------------ |
+| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ |
+| **RETURNS** | Whether the GPU was activated. ~~bool~~ |

 ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"}

@@ -164,9 +165,10 @@ and _before_ loading any pipelines.
 > nlp = spacy.load("en_core_web_sm")
 > ```

 | Name | Description |
-| ----------- | --------------- |
-| **RETURNS** | `True` ~~bool~~ |
+| ----------- | ------------------------------------------------ |
+| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ |
+| **RETURNS** | `True` ~~bool~~ |

 ## displaCy {#displacy source="spacy/displacy"}

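A sketch of the updated signatures in use (the device index defaults to `0`):

```python
import spacy

activated = spacy.prefer_gpu(gpu_id=0)  # True if the GPU was activated
spacy.require_gpu(gpu_id=0)             # errors if no GPU is available
nlp = spacy.load("en_core_web_sm")
```
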
@@ -456,6 +458,16 @@ remain in the config file stored on your local system.
 | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
 | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |

+<Project id="integrations/wandb">
+
+Get started with tracking your spaCy training runs in Weights & Biases using our
+project template. It trains on the IMDB Movie Review Dataset and includes a
+simple config with the built-in `WandbLogger`, as well as a custom example of
+creating variants of the config for a simple hyperparameter grid search and
+logging the results.
+
+</Project>
+
 ## Readers {#readers source="spacy/training/corpus.py" new="3"}

 Corpus readers are registered functions that load data and return a function

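For reference, a config excerpt wiring up the logger described in this table; the project name is a placeholder:

```ini
[training.logger]
@loggers = "spacy.WandbLogger.v1"
project_name = "my_spacy_project"
remove_config_values = ["paths.train", "paths.dev"]
```
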
@@ -289,8 +289,7 @@ of objects by referring to creation functions, including functions you register
 yourself. For details on how to get started with training your own model, check
 out the [training quickstart](/usage/training#quickstart).

-<!-- TODO:
-<Project id="en_core_trf_lg">
+<!-- TODO: <Project id="en_core_trf_lg">

 The easiest way to get started is to clone a transformers-based project
 template. Swap in your data, edit the settings and hyperparameters and train,

@@ -623,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`,
 `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
 expect the same types of objects, although for pretraining your corpus does not
 need to have any annotations, so you will often use a different reader, such as
-the [`JsonlReader`](/api/toplevel#jsonlreader).
+the [`JsonlReader`](/api/top-level#jsonlreader).

 > #### Raw text format
 >

@@ -655,6 +654,16 @@ and pass in optional config overrides, like the path to the raw text file:
 $ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl
 ```

+The following defaults are used for the `[pretraining]` block and merged into
+your existing config when you run [`init config`](/api/cli#init-config) or
+[`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed,
+you can [configure](#pretraining-configure) the settings and hyperparameters or
+change the [objective](#pretraining-details).
+
+```ini
+%%GITHUB_SPACY/spacy/default_config_pretraining.cfg
+```
+
 ### How pretraining works {#pretraining-details}

 The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually

@@ -45,7 +45,7 @@ spaCy v3.0 introduces transformer-based pipelines that bring spaCy's accuracy
 right up to **current state-of-the-art**. You can also use a CPU-optimized
 pipeline, which is less accurate but much cheaper to run.

-<!-- TODO: -->
+<!-- TODO: update benchmarks and intro -->

 > #### Evaluation details
 >

@@ -68,6 +68,6 @@ our project template.

 </Project>

-<!-- ## Citing spaCy {#citation}
+<!-- TODO: ## Citing spaCy {#citation}

-<!-- TODO: update -->
+-->

@@ -356,6 +356,18 @@ that training configs are complete and experiments fully reproducible.

 </Infobox>

+Note that when using a PyTorch or Tensorflow model, it is recommended to set the
+GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or
+"tensorflow" in the training config, cupy will allocate memory via those
+respective libraries, preventing OOM errors when there's available memory
+sitting in the other library's pool.
+
+```ini
+### config.cfg (excerpt)
+[training]
+gpu_allocator = "pytorch"
+```
+
 ## Custom models with Thinc {#thinc}

 Of course it's also possible to define the `Model` from the previous section
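For reference, the `gpu_allocator` setting in the hunk above corresponds to Thinc's allocator helpers; a minimal programmatic sketch, assuming Thinc v8 and an available CUDA device:

```python
# A minimal sketch, assuming thinc v8 and a CUDA-capable GPU.
from thinc.api import require_gpu, set_gpu_allocator

# Route cupy's allocations through PyTorch's memory pool, so both libraries
# draw from a single pool instead of hitting OOM while memory sits idle
# in the other library's pool.
set_gpu_allocator("pytorch")
require_gpu(0)  # the GPU id is an assumption
```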
@@ -477,7 +489,7 @@ with Model.define_operators({">>": chain}):
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 </Infobox>

-<!-- TODO:
+<!-- TODO: write trainable component section
 - Interaction with `predict`, `get_loss` and `set_annotations`
 - Initialization life-cycle with `begin_training`, correlation with add_label
 Example: relation extraction component (implemented as project template)
@@ -381,8 +381,6 @@ and loading pipeline packages, the underlying functionality is entirely based on
 native Python packaging. This allows your application to handle a spaCy pipeline
 like any other package dependency.

-<!-- TODO: reference relevant spaCy project -->
-
 ### Downloading and requiring package dependencies {#models-download}

 spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a
@@ -29,15 +29,13 @@ and share your results with your team. spaCy projects can be used via the new

 ![Illustration of project workflow and commands](../images/projects.svg)

-<!-- TODO:
-<Project id="some_example_project">
+<Project id="pipelines/tagger_parser_ud">

-Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
-sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
-mattis pretium.
+The easiest way to get started is to clone a project template and run it – for
+example, this end-to-end template that lets you train a **part-of-speech
+tagger** and **dependency parser** on a Universal Dependencies treebank.

 </Project>
--->

 spaCy projects make it easy to integrate with many other **awesome tools** in
 the data science and machine learning ecosystem to track and manage your data
@@ -65,10 +63,8 @@ project template and copies the files to a local directory. You can then run the
 project, e.g. to train a pipeline and edit the commands and scripts to build
 fully custom workflows.

-<!-- TODO: update with real example project -->
-
 ```cli
-python -m spacy project clone some_example_project
+python -m spacy project clone pipelines/tagger_parser_ud
 ```

 By default, the project will be cloned into the current working directory. You
@@ -216,10 +212,8 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up
 a quick web demo. It looks pretty similar to a config file used to define CI
 pipelines.

-<!-- TODO: update with better (final) example -->
-
 ```yaml
-https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml
+https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml
 ```

 | Section | Description |
@@ -976,14 +970,12 @@ your results.

 ![Screenshot: Visualized training results](../images/wandb1.jpg)

-<!-- TODO:
-
 <Project id="integrations/wandb">

 Get started with tracking your spaCy training runs in Weights & Biases using our
-project template. It includes a simple config using the `WandbLogger`, as well
-as a custom logger implementation you can adjust for your specific use case.
+project template. It trains on the IMDB Movie Review Dataset and includes a
+simple config with the built-in `WandbLogger`, as well as a custom example of
+creating variants of the config for a simple hyperparameter grid search and
+logging the results.

 </Project>
-
--->
@@ -574,7 +574,7 @@ The directory will be created if it doesn't exist, and the whole pipeline data,
 meta and configuration will be written out. To make the pipeline more convenient
 to deploy, we recommend wrapping it as a [Python package](/api/cli#package).

-<Accordion title="What’s the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config">
+<Accordion title="What’s the difference between the config.cfg and meta.json?" id="models-meta-vs-config" spaced>

 When you save a pipeline in spaCy v3.0+, two files will be exported: a
 [`config.cfg`](/api/data-formats#config) based on
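To illustrate the split the accordion above describes, a minimal sketch; the installed pipeline and output path here are assumptions:

```python
# A minimal sketch, assuming en_core_web_sm is installed and ./my_pipeline is writable.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.to_disk("./my_pipeline")  # exports config.cfg, meta.json and the component data

# config.cfg defines how the pipeline is constructed and trained ...
print(nlp.config["nlp"]["pipeline"])
# ... while meta.json carries descriptive metadata
print(nlp.meta["name"], nlp.meta["version"])
```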
@@ -596,6 +596,15 @@ based on [`nlp.meta`](/api/language#meta).

 </Accordion>

+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started with an end-to-end workflow is to clone a
+[project template](/usage/projects) and run it – for example, this template that
+lets you train a **part-of-speech tagger** and **dependency parser** on a
+Universal Dependencies treebank and generates an installable Python package.
+
+</Project>
+
 ### Generating a pipeline package {#models-generating}

 <Infobox title="Important note" variant="warning">
@@ -699,5 +708,3 @@ class and call [`from_disk`](/api/language#from_disk) instead.
 ```python
 nlp = spacy.blank("en").from_disk("/path/to/data")
 ```
-
-<!-- TODO: point to spaCy projects? -->
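For contrast with the snippet above, a minimal sketch of the two loading styles side by side; the package name and data path are assumed:

```python
# A minimal sketch; "en_my_pipeline" and "/path/to/data" are assumed names.
import spacy

# Installed package or registered pipeline name: spacy.load builds
# everything from the package's config.cfg
nlp = spacy.load("en_my_pipeline")

# Bare data directory without package metadata: construct the Language
# class yourself, then deserialize the component data into it
nlp = spacy.blank("en").from_disk("/path/to/data")
```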
@@ -92,7 +92,7 @@ spaCy's binary `.spacy` format. You can either include the data paths in the
 $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
 ```

-<Accordion title="How are the config recommendations generated?" id="quickstart-source">
+<Accordion title="How are the config recommendations generated?" id="quickstart-source" spaced>

 The recommended config settings generated by the quickstart widget and the
 [`init config`](/api/cli#init-config) command are based on some general **best
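If your corpus is not yet in the binary `.spacy` format referenced above, a minimal sketch of producing it with `DocBin`; the text and entity span are made-up placeholders:

```python
# A minimal sketch; the text and the ORG span are placeholder examples.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp.make_doc("Apple is looking at buying U.K. startup")
doc.ents = [doc.char_span(0, 5, label="ORG")]  # annotate however your data requires

doc_bin = DocBin()
doc_bin.add(doc)
doc_bin.to_disk("./train.spacy")  # pass as --paths.train to spacy train
```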
@@ -112,6 +112,15 @@ as we run more experiments.

 </Accordion>

+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>
+
 ## Training config {#config}

 Training config files include all **settings and hyperparameters** for training
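Because the config is the single source of truth, it can be useful to inspect it programmatically; a minimal sketch using spaCy's config loader, with an assumed path:

```python
# A minimal sketch, assuming a config.cfg created with `spacy init config`.
from spacy.util import load_config

config = load_config("./config.cfg")    # a thinc Config, behaves like a nested dict
print(config["nlp"]["pipeline"])        # e.g. ["tok2vec", "tagger", "parser"]
print(config["training"]["max_steps"])  # hyperparameters live in the [training] block
```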
@@ -176,18 +176,16 @@ freely combine implementations from different frameworks into a single model.

 ### Manage end-to-end workflows with projects {#features-projects}

-<!-- TODO: update example -->
-
 > #### Example
 >
 > ```cli
 > # Clone a project template
-> $ python -m spacy project clone example
-> $ cd example
+> $ python -m spacy project clone pipelines/tagger_parser_ud
+> $ cd tagger_parser_ud
 > # Download data assets
 > $ python -m spacy project assets
 > # Run a workflow
-> $ python -m spacy project run train
+> $ python -m spacy project run all
 > ```

 spaCy projects let you manage and share **end-to-end spaCy workflows** for
@@ -207,14 +205,6 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
 [Ray](/usage/projects#ray) for parallel training,
 [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more!

-<!-- <Project id="some_example_project">
-
-The easiest way to get started with an end-to-end training process is to clone a
-[project](/usage/projects) template. Projects let you manage multi-step
-workflows, from data preprocessing to training and packaging your pipeline.
-
-</Project>-->
-
 <Infobox title="Details & Documentation" emoji="📖" list>

 - **Usage:** [spaCy projects](/usage/projects),
@@ -224,6 +214,15 @@ workflows, from data preprocessing to training and packaging your pipeline.

 </Infobox>

+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>
+
 ### Parallel and distributed training with Ray {#features-parallel-training}

 > #### Example
@@ -875,7 +874,14 @@ values. You can then use the auto-generated `config.cfg` for training:
 + python -m spacy train ./config.cfg --output ./output
 ```

-<!-- TODO: project template -->
+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>

 #### Training via the Python API {#migrating-training-python}
@@ -12,6 +12,7 @@
     "companyUrl": "https://explosion.ai",
     "repo": "explosion/spaCy",
     "modelsRepo": "explosion/spacy-models",
+    "projectsRepo": "explosion/projects/tree/v3",
     "social": {
         "twitter": "spacy_io",
         "github": "explosion"
@@ -13,7 +13,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) {
     const isValid = isString(children) && !isNaN(children)
     const version = isValid ? Number(children).toFixed(1) : children
     const tooltipText = `This feature is new and was introduced in spaCy v${version}`
-    // TODO: we probably want to handle this more elegantly, but the idea is
+    // We probably want to handle this more elegantly, but the idea is
     // that we can hide tags referring to old versions
     const major = isString(version) ? Number(version.split('.')[0]) : version
     return major < MIN_VERSION ? null : (
@@ -10,6 +10,7 @@ const htmlToReactParser = new HtmlToReactParser()
 const DEFAULT_BRANCH = 'develop'
 export const repo = siteMetadata.repo
 export const modelsRepo = siteMetadata.modelsRepo
+export const projectsRepo = siteMetadata.projectsRepo

 /**
  * This is used to provide selectors for headings so they can be crawled by
@@ -15,6 +15,10 @@
     background: transparent
     resize: none
     font: inherit
+    overflow: hidden
+    white-space: nowrap
+    text-overflow: ellipsis
+    margin-right: 1rem

 .prefix
     margin-right: 0.75em
@@ -30,7 +30,6 @@ import Benchmarks from 'usage/_benchmarks-models.md'

 const CODE_EXAMPLE = `# pip install spacy
 # python -m spacy download en_core_web_sm
-
 import spacy

 # Load English tokenizer, tagger, parser and NER
@@ -120,7 +119,7 @@ const Landing = ({ data }) => {
             </Li>
             <Li>
                 ✅ Components for <strong>named entity</strong> recognition,
-                part-of-speech-tagging, dependency parsing, sentence segmentation,{' '}
+                part-of-speech tagging, dependency parsing, sentence segmentation,{' '}
                 <strong>text classification</strong>, lemmatization, morphological
                 analysis, entity linking and more
             </Li>
@@ -223,10 +222,11 @@ const Landing = ({ data }) => {
             <br />
             <br />
             <br />
-            {/** TODO: update with actual example */}
-            <Project id="some_example">
-                Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
-                sodales lectus.
+            <Project id="pipelines/tagger_parser_ud" title="Get started">
+                The easiest way to get started is to clone a project template and run it
+                – for example, this template for training a{' '}
+                <strong>part-of-speech tagger</strong> and{' '}
+                <strong>dependency parser</strong> on a Universal Dependencies treebank.
             </Project>
         </LandingCol>
         <LandingCol>
@@ -4,25 +4,29 @@ import CopyInput from '../components/copy'
 import Infobox from '../components/infobox'
 import Link from '../components/link'
 import { InlineCode } from '../components/code'
+import { projectsRepo } from '../components/util'

-// TODO: move to meta?
-const DEFAULT_REPO = 'https://github.com/explosion/projects/tree/v3'
 const COMMAND = 'python -m spacy project clone'

-export default function Project({ id, repo, children }) {
+export default function Project({
+    title = 'Get started with a project template',
+    id,
+    repo,
+    children,
+}) {
     const repoArg = repo ? ` --repo ${repo}` : ''
     const text = `${COMMAND} ${id}${repoArg}`
-    const url = `${repo || DEFAULT_REPO}/${id}`
-    const title = (
+    const url = `${repo || projectsRepo}/${id}`
+    const header = (
         <>
-            Get started with a project template:{' '}
+            {title}:{' '}
             <Link to={url}>
                 <InlineCode>{id}</InlineCode>
             </Link>
         </>
     )
     return (
-        <Infobox title={title} emoji="🪐">
+        <Infobox title={header} emoji="🪐">
             {children}
             <CopyInput text={text} prefix="$" />
         </Infobox>