Mirror of https://github.com/explosion/spaCy.git (synced 2025-03-21 18:34:14 +03:00)

Commit e22de2e69d: Merge branch 'develop' into nightly.spacy.io
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a33,<8.0.0a40",
+    "thinc>=8.0.0a34,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a33,<8.0.0a40
+thinc>=8.0.0a34,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a33,<8.0.0a40
+    thinc>=8.0.0a34,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a33,<8.0.0a40
+    thinc>=8.0.0a34,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
@@ -308,6 +308,31 @@ def git_checkout(
         msg.fail("Destination of checkout must not exist", exits=1)
     if not dest.parent.exists():
         raise IOError("Parent of destination of checkout must exist")
+
+    if sparse and git_version >= (2, 22):
+        return git_sparse_checkout(repo, subpath, dest, branch)
+    elif sparse:
+        # Only show warnings if the user explicitly wants sparse checkout but
+        # the Git version doesn't support it
+        err_old = (
+            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
+            f"that doesn't fully support sparse checkout yet."
+        )
+        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
+        msg.warn(
+            f"{err_unk if git_version == (0, 0) else err_old} "
+            f"This means that more files than necessary may be downloaded "
+            f"temporarily. To only download the files needed, make sure "
+            f"you're using Git v2.22 or above."
+        )
+    with make_tempdir() as tmp_dir:
+        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
+        ret = run_command(cmd, capture=True)
+        # We need Path(name) to make sure we also support subdirectories
+        shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
+
+
+def git_sparse_checkout(repo, subpath, dest, branch):
     # We're using Git, partial clone and sparse checkout to
     # only clone the files we need
     # This ends up being RIDICULOUS. omg.
@@ -324,47 +349,31 @@ def git_checkout(
     # *that* we can do by path.
     # We're using Git and sparse checkout to only clone the files we need
     with make_tempdir() as tmp_dir:
-        supports_sparse = git_version >= (2, 22)
-        use_sparse = supports_sparse and sparse
         # This is the "clone, but don't download anything" part.
-        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
-        if use_sparse:
-            cmd += f"--filter=blob:none"  # <-- The key bit
-        # Only show warnings if the user explicitly wants sparse checkout but
-        # the Git version doesn't support it
-        elif sparse:
-            err_old = (
-                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-                f"that doesn't fully support sparse checkout yet."
+        cmd = (
+            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
+            f"-b {branch} --filter=blob:none"
         )
-            err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
-            msg.warn(
-                f"{err_unk if git_version == (0, 0) else err_old} "
-                f"This means that more files than necessary may be downloaded "
-                f"temporarily. To only download the files needed, make sure "
-                f"you're using Git v2.22 or above."
-            )
-        try_run_command(cmd)
+        run_command(cmd)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
-        ret = try_run_command(cmd)
+        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
+        ret = run_command(cmd, capture=True)
         git_repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals
         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        if use_sparse and not missings:
+        if not missings:
            err = (
                f"Could not find any relevant files for '{subpath}'. "
                f"Did you specify a correct and complete path within repo '{repo}' "
                f"and branch {branch}?"
            )
            msg.fail(err, exits=1)
-        if use_sparse:
        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-            try_run_command(cmd)
+        run_command(cmd, capture=True)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        try_run_command(cmd)
+        run_command(cmd, capture=True)
        # We need Path(name) to make sure we also support subdirectories
        shutil.move(str(tmp_dir / Path(subpath)), str(dest))
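For readers following this refactor, the following is a rough sketch of the shell sequence the new sparse-checkout path issues; all repo, branch, subpath and directory values below are placeholders, and the real helper builds these strings from its arguments and runs them via run_command():

```python
# Sketch only: the approximate git invocations behind the sparse checkout
# logic above, with placeholder values.
repo = "https://github.com/explosion/projects"  # placeholder
branch = "v3"                                   # placeholder
subpath = "pipelines/tagger_parser_ud"          # placeholder
tmp_dir = "/tmp/checkout"                       # placeholder

steps = [
    # Clone metadata only, without blobs ("--filter=blob:none" is the key bit).
    f"git clone {repo} {tmp_dir} --no-checkout --depth 1 -b {branch} --filter=blob:none",
    # List the objects that are still missing for the wanted subpath.
    f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}",
    # Fetch the missing objects (hashes parsed from the previous step).
    f"git -C {tmp_dir} fetch-pack {repo} <missing object hashes>",
    # Finally check out only the subpath on the requested branch.
    f"git -C {tmp_dir} checkout {branch} {subpath}",
]
```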
@@ -378,7 +387,7 @@ def get_git_version(
     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
         (0, 0) if the version couldn't be determined.
     """
-    ret = try_run_command(["git", "--version"], error=error)
+    ret = run_command("git --version", capture=True)
     stdout = ret.stdout.strip()
     if not stdout or not stdout.startswith("git version"):
         return (0, 0)

@@ -386,23 +395,6 @@ def get_git_version(
     return (int(version[0]), int(version[1]))


-def try_run_command(
-    cmd: Union[str, List[str]], error: str = "Could not run command"
-) -> subprocess.CompletedProcess:
-    """Try running a command and raise an error if it fails.
-
-    cmd (Union[str, List[str]]): The command to run.
-    error (str): The error message.
-    RETURNS (CompletedProcess): The completed process if the command ran.
-    """
-    try:
-        return run_command(cmd, capture=True)
-    except subprocess.CalledProcessError as e:
-        msg.fail(error)
-        print(cmd)
-        sys.exit(1)
-
-
 def _from_http_to_git(repo: str) -> str:
     if repo.startswith("http://"):
         repo = repo.replace(r"http://", r"https://")
@@ -2,7 +2,7 @@ from typing import Dict, Any, Optional
 from pathlib import Path
 from wasabi import msg
 from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
-from thinc.api import Model, data_validation
+from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error

@@ -53,7 +53,12 @@ def debug_model_cli(
     }
     config_overrides = parse_config_overrides(ctx.args)
     with show_validation_error(config_path):
-        config = util.load_config(config_path, overrides=config_overrides)
+        config = util.load_config(
+            config_path, overrides=config_overrides, interpolate=True
+        )
+        allocator = config["training"]["gpu_allocator"]
+        if use_gpu >= 0 and allocator:
+            set_gpu_allocator(allocator)
         nlp, config = util.load_model_from_config(config_path)
     seed = config["training"]["seed"]
     if seed is not None:
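As a quick illustration of the pattern the updated CLI code uses (the config path and override key below are placeholders, not from the commit), util.load_config can now interpolate variables while applying overrides, and the resolved gpu_allocator value is handed to Thinc:

```python
from spacy import util
from thinc.api import set_gpu_allocator

# Load a config the way the updated debug command does: apply CLI-style
# overrides and interpolate variables such as ${system.gpu_allocator}.
config = util.load_config(
    "config.cfg", overrides={"training.seed": 0}, interpolate=True
)
allocator = config["training"]["gpu_allocator"]
if allocator:  # e.g. "pytorch" or "tensorflow"
    set_gpu_allocator(allocator)
```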
@@ -110,7 +110,7 @@ def package(
     msg.good(f"Successfully created package '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "sdist"])
+            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
         msg.good(f"Successfully created zipped Python package", zip_file)
@@ -4,10 +4,9 @@ import time
 import re
 from collections import Counter
 from pathlib import Path
-from thinc.api import Config
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu
+from thinc.api import require_gpu, set_gpu_allocator
 from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import CosineDistance, L2Distance
+from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
 import srsly
 from functools import partial

@@ -32,7 +31,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),

@@ -99,10 +98,12 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
 ):
-    if config["system"].get("seed") is not None:
-        fix_random_seed(config["system"]["seed"])
-    if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
-        use_pytorch_for_gpu_memory()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+
     nlp, config = util.load_model_from_config(config)
     P_cfg = config["pretraining"]
     corpus = dot_to_object(config, P_cfg["corpus"])
@@ -59,8 +59,9 @@ def project_run(
     for dep in cmd.get("deps", []):
         if not (project_dir / dep).exists():
             err = f"Missing dependency specified by command '{subcommand}': {dep}"
+            err_help = "Maybe you forgot to run the 'project assets' command?"
             err_kwargs = {"exits": 1} if not dry else {}
-            msg.fail(err, **err_kwargs)
+            msg.fail(err, err_help, **err_kwargs)
     with working_dir(project_dir) as current_dir:
         rerun = check_rerun(current_dir, cmd)
         if not rerun and not force:

@@ -144,7 +145,7 @@ def run_commands(
         if not silent:
             print(f"Running command: {join_command(command)}")
         if not dry:
-            run_command(command)
+            run_command(command, capture=False)


 def validate_subcommand(
@@ -8,7 +8,11 @@ train = ""
 dev = ""

 [system]
-use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
+{% if use_transformer -%}
+gpu_allocator = "pytorch"
+{% else -%}
+gpu_allocator = null
+{% endif %}

 [nlp]
 lang = "{{ lang }}"
@@ -6,8 +6,7 @@ from pathlib import Path
 from wasabi import msg
 import thinc
 import thinc.schedules
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
 import random
 import typer
 import logging

@@ -29,7 +28,7 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     resume: bool = Opt(False, "--resume", "-R", help="Resume training"),

@@ -79,15 +78,16 @@ def train(
         config = util.load_config(
             config_path, overrides=config_overrides, interpolate=True
         )
-    if config.get("training", {}).get("seed") is not None:
+    if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
-    if config.get("system", {}).get("use_pytorch_for_gpu_memory"):
-        # It feels kind of weird to not have a default for this.
-        use_pytorch_for_gpu_memory()
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
@@ -6,13 +6,12 @@ init_tok2vec = null

 [system]
 seed = 0
-use_pytorch_for_gpu_memory = false
+gpu_allocator = null

 [nlp]
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null

@@ -52,12 +51,14 @@ limit = 0
 # Training hyper-parameters and additional features.
 [training]
 seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0

@@ -75,7 +76,6 @@ train_corpus = "corpora.train"
 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
-

 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups


 # This is the base config will all settings (training etc.)

@@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
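A minimal sketch of how the newly registered loader can be resolved and called directly; this assumes the optional spacy-lookups-data package is installed, and the language and table selection are illustrative:

```python
from spacy.util import registry

# Look up the registered function by name and build a Lookups object for
# English with a single illustrative table.
loader = registry.misc.get("spacy.LookupsDataLoader.v1")
lookups = loader(lang="en", tables=["lexeme_norm"])
print(lookups.tables)
```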
@@ -152,7 +160,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):

@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer

 from .attrs import NAMES
+from .lookups import Lookups

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports

@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")

@@ -207,6 +209,7 @@ class ConfigSchemaTraining(BaseModel):
     max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
     eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
     seed: Optional[StrictInt] = Field(..., title="Random seed")
+    gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")

@@ -227,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false

     [components]

@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
@@ -253,6 +253,14 @@ def load_vectors_into_model(
             nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"] = None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.lookups = lookups
+
+
 def load_model(
     name: Union[str, Path],
     *,

@@ -651,8 +659,8 @@ def join_command(command: List[str]) -> str:
 def run_command(
     command: Union[str, List[str]],
     *,
-    capture: bool = False,
     stdin: Optional[Any] = None,
+    capture: bool = False,
 ) -> Optional[subprocess.CompletedProcess]:
     """Run a command on the command line as a subprocess. If the subprocess
     returns a non-zero exit code, a system exit is performed.
@@ -660,33 +668,46 @@ def run_command(
     command (str / List[str]): The command. If provided as a string, the
         string will be split using shlex.split.
     stdin (Optional[Any]): stdin to read from or None.
-    capture (bool): Whether to capture the output.
+    capture (bool): Whether to capture the output and errors. If False,
+        the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the returncode. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
     RETURNS (Optional[CompletedProcess]): The process object.
     """
     if isinstance(command, str):
-        command = split_command(command)
+        cmd_list = split_command(command)
+        cmd_str = command
+    else:
+        cmd_list = command
+        cmd_str = " ".join(command)
     try:
         ret = subprocess.run(
-            command,
+            cmd_list,
             env=os.environ.copy(),
             input=stdin,
             encoding="utf8",
-            check=True,
+            check=False,
             stdout=subprocess.PIPE if capture else None,
-            stderr=subprocess.PIPE if capture else None,
+            stderr=subprocess.STDOUT if capture else None,
         )
     except FileNotFoundError:
+        # Indicates the *command* wasn't found, it's an error before the command
+        # is run.
         raise FileNotFoundError(
-            Errors.E970.format(str_command=" ".join(command), tool=command[0])
+            Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
         ) from None
-    except subprocess.CalledProcessError as e:
-        # We don't want a duplicate traceback here so we're making sure the
-        # CalledProcessError isn't re-raised. We also print both the string
-        # message and the stderr, in case the error only has one of them.
-        print(e.stderr)
-        print(e)
-        sys.exit(1)
-    if ret.returncode != 0:
+    if ret.returncode != 0 and capture:
+        message = f"Error running command:\n\n{cmd_str}\n\n"
+        message += f"Subprocess exited with status {ret.returncode}"
+        if ret.stdout is not None:
+            message += f"\n\nProcess log (stdout and stderr):\n\n"
+            message += ret.stdout
+        error = subprocess.SubprocessError(message)
+        error.ret = ret
+        error.command = cmd_str
+        raise error
+    elif ret.returncode != 0:
         sys.exit(ret.returncode)
     return ret
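A short usage sketch of the reworked helper, following the docstring above; the commands shown are placeholders:

```python
from spacy.util import run_command

# capture=True: run the command "like a function"; output is captured and a
# non-zero exit raises a SubprocessError instead of exiting the interpreter.
ret = run_command("git --version", capture=True)
print(ret.stdout.strip())

# capture=False: hand execution over to the command; on failure, sys.exit()
# is called with the command's return code.
run_command("python --version", capture=False)
```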
@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length

@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)

@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,

@@ -424,6 +417,19 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors

+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
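A small sketch of what the new property enables: assigning a Lookups object to vocab.lookups also refreshes the NORM lexeme getter when a lexeme_norm table is present. The table contents below are made up for illustration; real tables usually come from spacy-lookups-data:

```python
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lookups = Lookups()
# Tiny illustrative table (not real data from spacy-lookups-data).
lookups.add_table("lexeme_norm", {"gonna": "going to"})
# Triggers the __set__ branch added above and updates lex_attr_getters[NORM].
nlp.vocab.lookups = lookups
```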
@@ -763,6 +763,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
 | `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
 | **CREATES** | The final trained pipeline and the best trained pipeline. |

@@ -798,11 +799,12 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [

 | Name | Description |
 | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
 | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
+| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
 | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
+| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
 | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |

@@ -893,8 +895,6 @@ what you need. By default, spaCy's
 can provide any other repo (public or private) that you have access to using the
 `--repo` option.

-<!-- TODO: update example once we've decided on repo structure -->
-
 ```cli
 $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
 ```

@@ -902,7 +902,7 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
 > #### Example
 >
 > ```cli
-> $ python -m spacy project clone some_example
+> $ python -m spacy project clone pipelines/ner_wikiner
 > ```
 >
 > Clone from custom repo:

@@ -60,7 +60,6 @@ your config and check that it's valid, you can run the
 > [nlp]
 > lang = "en"
 > pipeline = ["tagger", "parser", "ner"]
-> load_vocab_data = true
 > before_creation = null
 > after_creation = null
 > after_pipeline_creation = null

@@ -77,7 +76,6 @@ Defines the `nlp` object, its tokenizer and
 | `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
 | `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
 | `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
-| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
 | `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
 | `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
 | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |

@@ -190,7 +188,9 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
 | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
 | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
+| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ |
 | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
 | `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
 | `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |

@@ -146,7 +146,8 @@ pipelines.
 > ```

 | Name        | Description |
-| ----------- | --------------------------------------- |
+| ----------- | ------------------------------------------------ |
+| `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ |
 | **RETURNS** | Whether the GPU was activated. ~~bool~~ |

 ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"}

@@ -165,7 +166,8 @@ and _before_ loading any pipelines.
 > ```

 | Name        | Description |
-| ----------- | --------------- |
+| ----------- | ------------------------------------------------ |
+| `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ |
 | **RETURNS** | `True` ~~bool~~ |

 ## displaCy {#displacy source="spacy/displacy"}
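For reference, the documented gpu_id argument maps onto calls like these (the device indices are examples):

```python
import spacy

# Try to use GPU 0 if available; returns True when a GPU was activated.
activated = spacy.prefer_gpu(0)

# Require a specific device and fail loudly if it can't be allocated.
spacy.require_gpu(0)
```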
@@ -456,6 +458,16 @@ remain in the config file stored on your local system.
 | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
 | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |

+<Project id="integrations/wandb">
+
+Get started with tracking your spaCy training runs in Weights & Biases using our
+project template. It trains on the IMDB Movie Review Dataset and includes a
+simple config with the built-in `WandbLogger`, as well as a custom example of
+creating variants of the config for a simple hyperparameter grid search and
+logging the results.
+
+</Project>
+
 ## Readers {#readers source="spacy/training/corpus.py" new="3"}

 Corpus readers are registered functions that load data and return a function

@@ -289,8 +289,7 @@ of objects by referring to creation functions, including functions you register
 yourself. For details on how to get started with training your own model, check
 out the [training quickstart](/usage/training#quickstart).

-<!-- TODO:
-<Project id="en_core_trf_lg">
+<!-- TODO: <Project id="en_core_trf_lg">

 The easiest way to get started is to clone a transformers-based project
 template. Swap in your data, edit the settings and hyperparameters and train,

@@ -623,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`,
 `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
 expect the same types of objects, although for pretraining your corpus does not
 need to have any annotations, so you will often use a different reader, such as
-the [`JsonlReader`](/api/toplevel#jsonlreader).
+the [`JsonlReader`](/api/top-level#jsonlreader).

 > #### Raw text format
 >

@@ -655,6 +654,16 @@ and pass in optional config overrides, like the path to the raw text file:
 $ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl
 ```

+The following defaults are used for the `[pretraining]` block and merged into
+your existing config when you run [`init config`](/api/cli#init-config) or
+[`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed,
+you can [configure](#pretraining-configure) the settings and hyperparameters or
+change the [objective](#pretraining-details).
+
+```ini
+%%GITHUB_SPACY/spacy/default_config_pretraining.cfg
+```
+
 ### How pretraining works {#pretraining-details}

 The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually

@@ -45,7 +45,7 @@ spaCy v3.0 introduces transformer-based pipelines that bring spaCy's accuracy
 right up to **current state-of-the-art**. You can also use a CPU-optimized
 pipeline, which is less accurate but much cheaper to run.

-<!-- TODO: -->
+<!-- TODO: update benchmarks and intro -->

 > #### Evaluation details
 >

@@ -68,6 +68,6 @@ our project template.

 </Project>

-<!-- ## Citing spaCy {#citation}
+<!-- TODO: ## Citing spaCy {#citation}

 <!-- TODO: update -->
 -->

@@ -356,6 +356,18 @@ that training configs are complete and experiments fully reproducible.

 </Infobox>

+Note that when using a PyTorch or Tensorflow model, it is recommended to set the
+GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or
+"tensorflow" in the training config, cupy will allocate memory via those
+respective libraries, preventing OOM errors when there's available memory
+sitting in the other library's pool.
+
+```ini
+### config.cfg (excerpt)
+[training]
+gpu_allocator = "pytorch"
+```
+
 ## Custom models with Thinc {#thinc}

 Of course it's also possible to define the `Model` from the previous section
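The same setting can also be applied programmatically via Thinc, which is what the training code in this commit does under the hood; a minimal sketch:

```python
from thinc.api import require_gpu, set_gpu_allocator

# Route cupy's GPU memory allocation through PyTorch so both libraries draw
# from a single pool (equivalent to gpu_allocator = "pytorch" in the config).
set_gpu_allocator("pytorch")
require_gpu(0)
```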
@@ -477,7 +489,7 @@ with Model.define_operators({">>": chain}):
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 </Infobox>

-<!-- TODO:
+<!-- TODO: write trainable component section
 - Interaction with `predict`, `get_loss` and `set_annotations`
 - Initialization life-cycle with `begin_training`, correlation with add_label
 Example: relation extraction component (implemented as project template)

@@ -381,8 +381,6 @@ and loading pipeline packages, the underlying functionality is entirely based on
 native Python packaging. This allows your application to handle a spaCy pipeline
 like any other package dependency.

-<!-- TODO: reference relevant spaCy project -->
-
 ### Downloading and requiring package dependencies {#models-download}

 spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a

@@ -29,15 +29,13 @@ and share your results with your team. spaCy projects can be used via the new

 

-<!-- TODO:
-<Project id="some_example_project">
+<Project id="pipelines/tagger_parser_ud">

-Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
-sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
-mattis pretium.
+The easiest way to get started is to clone a project template and run it – for
+example, this end-to-end template that lets you train a **part-of-speech
+tagger** and **dependency parser** on a Universal Dependencies treebank.

 </Project>
--->

 spaCy projects make it easy to integrate with many other **awesome tools** in
 the data science and machine learning ecosystem to track and manage your data

@@ -65,10 +63,8 @@ project template and copies the files to a local directory. You can then run the
 project, e.g. to train a pipeline and edit the commands and scripts to build
 fully custom workflows.

-<!-- TODO: update with real example project -->
-
 ```cli
-python -m spacy project clone some_example_project
+python -m spacy project clone pipelines/tagger_parser_ud
 ```

 By default, the project will be cloned into the current working directory. You

@@ -216,10 +212,8 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up
 a quick web demo. It looks pretty similar to a config file used to define CI
 pipelines.

-<!-- TODO: update with better (final) example -->
-
 ```yaml
-https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml
+https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml
 ```

 | Section | Description |

@@ -976,14 +970,12 @@ your results.

 

-<!-- TODO:
-
 <Project id="integrations/wandb">

 Get started with tracking your spaCy training runs in Weights & Biases using our
-project template. It includes a simple config using the `WandbLogger`, as well
-as a custom logger implementation you can adjust for your specific use case.
+project template. It trains on the IMDB Movie Review Dataset and includes a
+simple config with the built-in `WandbLogger`, as well as a custom example of
+creating variants of the config for a simple hyperparameter grid search and
+logging the results.

 </Project>
-
--->

@@ -574,7 +574,7 @@ The directory will be created if it doesn't exist, and the whole pipeline data,
 meta and configuration will be written out. To make the pipeline more convenient
 to deploy, we recommend wrapping it as a [Python package](/api/cli#package).

-<Accordion title="What’s the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config">
+<Accordion title="What’s the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config" spaced>

 When you save a pipeline in spaCy v3.0+, two files will be exported: a
 [`config.cfg`](/api/data-formats#config) based on

@@ -596,6 +596,15 @@ based on [`nlp.meta`](/api/language#meta).

 </Accordion>

+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started with an end-to-end workflow is to clone a
+[project template](/usage/projects) and run it – for example, this template that
+lets you train a **part-of-speech tagger** and **dependency parser** on a
+Universal Dependencies treebank and generates an installable Python package.
+
+</Project>
+
 ### Generating a pipeline package {#models-generating}

 <Infobox title="Important note" variant="warning">

@@ -699,5 +708,3 @@ class and call [`from_disk`](/api/language#from_disk) instead.
 ```python
 nlp = spacy.blank("en").from_disk("/path/to/data")
 ```
-
-<!-- TODO: point to spaCy projects? -->

@@ -92,7 +92,7 @@ spaCy's binary `.spacy` format. You can either include the data paths in the
 $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
 ```

-<Accordion title="How are the config recommendations generated?" id="quickstart-source">
+<Accordion title="How are the config recommendations generated?" id="quickstart-source" spaced>

 The recommended config settings generated by the quickstart widget and the
 [`init config`](/api/cli#init-config) command are based on some general **best

@@ -112,6 +112,15 @@ as we run more experiments.

 </Accordion>

+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>
+
 ## Training config {#config}

 Training config files include all **settings and hyperparameters** for training

@@ -176,18 +176,16 @@ freely combine implementations from different frameworks into a single model.

 ### Manage end-to-end workflows with projects {#features-projects}

-<!-- TODO: update example -->
-
 > #### Example
 >
 > ```cli
 > # Clone a project template
-> $ python -m spacy project clone example
-> $ cd example
+> $ python -m spacy project clone pipelines/tagger_parser_ud
+> $ cd tagger_parser_ud
 > # Download data assets
 > $ python -m spacy project assets
 > # Run a workflow
-> $ python -m spacy project run train
+> $ python -m spacy project run all
 > ```

 spaCy projects let you manage and share **end-to-end spaCy workflows** for

@@ -207,14 +205,6 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
 [Ray](/usage/projects#ray) for parallel training,
 [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more!

-<!-- <Project id="some_example_project">
-
-The easiest way to get started with an end-to-end training process is to clone a
-[project](/usage/projects) template. Projects let you manage multi-step
-workflows, from data preprocessing to training and packaging your pipeline.
-
-</Project>-->
-
 <Infobox title="Details & Documentation" emoji="📖" list>

 - **Usage:** [spaCy projects](/usage/projects),

@@ -224,6 +214,15 @@ workflows, from data preprocessing to training and packaging your pipeline.

 </Infobox>

+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>
+
 ### Parallel and distributed training with Ray {#features-parallel-training}

 > #### Example

@@ -875,7 +874,14 @@ values. You can then use the auto-generated `config.cfg` for training:
 + python -m spacy train ./config.cfg --output ./output
 ```

-<!-- TODO: project template -->
+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>

 #### Training via the Python API {#migrating-training-python}
@@ -12,6 +12,7 @@
     "companyUrl": "https://explosion.ai",
     "repo": "explosion/spaCy",
     "modelsRepo": "explosion/spacy-models",
+    "projectsRepo": "explosion/projects/tree/v3",
     "social": {
         "twitter": "spacy_io",
         "github": "explosion"

@@ -13,7 +13,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) {
     const isValid = isString(children) && !isNaN(children)
     const version = isValid ? Number(children).toFixed(1) : children
     const tooltipText = `This feature is new and was introduced in spaCy v${version}`
-    // TODO: we probably want to handle this more elegantly, but the idea is
+    // We probably want to handle this more elegantly, but the idea is
     // that we can hide tags referring to old versions
     const major = isString(version) ? Number(version.split('.')[0]) : version
     return major < MIN_VERSION ? null : (

@@ -10,6 +10,7 @@ const htmlToReactParser = new HtmlToReactParser()
 const DEFAULT_BRANCH = 'develop'
 export const repo = siteMetadata.repo
 export const modelsRepo = siteMetadata.modelsRepo
+export const projectsRepo = siteMetadata.projectsRepo

 /**
  * This is used to provide selectors for headings so they can be crawled by

@@ -15,6 +15,10 @@
     background: transparent
     resize: none
     font: inherit
+    overflow: hidden
+    white-space: nowrap
+    text-overflow: ellipsis
+    margin-right: 1rem

 .prefix
     margin-right: 0.75em

@@ -30,7 +30,6 @@ import Benchmarks from 'usage/_benchmarks-models.md'

 const CODE_EXAMPLE = `# pip install spacy
-# python -m spacy download en_core_web_sm

 import spacy

 # Load English tokenizer, tagger, parser and NER

@@ -120,7 +119,7 @@ const Landing = ({ data }) => {
                     </Li>
                     <Li>
                         ✅ Components for <strong>named entity</strong> recognition,
-                        part-of-speech-tagging, dependency parsing, sentence segmentation,{' '}
+                        part-of-speech tagging, dependency parsing, sentence segmentation,{' '}
                         <strong>text classification</strong>, lemmatization, morphological
                         analysis, entity linking and more
                     </Li>

@@ -223,10 +222,11 @@ const Landing = ({ data }) => {
                     <br />
                     <br />
                     <br />
-                    {/** TODO: update with actual example */}
-                    <Project id="some_example">
-                        Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
-                        sodales lectus.
+                    <Project id="pipelines/tagger_parser_ud" title="Get started">
+                        The easiest way to get started is to clone a project template and run it
+                        – for example, this template for training a{' '}
+                        <strong>part-of-speech tagger</strong> and{' '}
+                        <strong>dependency parser</strong> on a Universal Dependencies treebank.
                     </Project>
                 </LandingCol>
                 <LandingCol>

@@ -4,25 +4,29 @@ import CopyInput from '../components/copy'
 import Infobox from '../components/infobox'
 import Link from '../components/link'
 import { InlineCode } from '../components/code'
+import { projectsRepo } from '../components/util'

-// TODO: move to meta?
-const DEFAULT_REPO = 'https://github.com/explosion/projects/tree/v3'
 const COMMAND = 'python -m spacy project clone'

-export default function Project({ id, repo, children }) {
+export default function Project({
+    title = 'Get started with a project template',
+    id,
+    repo,
+    children,
+}) {
     const repoArg = repo ? ` --repo ${repo}` : ''
     const text = `${COMMAND} ${id}${repoArg}`
-    const url = `${repo || DEFAULT_REPO}/${id}`
-    const title = (
+    const url = `${repo || projectsRepo}/${id}`
+    const header = (
         <>
-            Get started with a project template:{' '}
+            {title}:{' '}
             <Link to={url}>
                 <InlineCode>{id}</InlineCode>
             </Link>
         </>
     )
     return (
-        <Infobox title={title} emoji="🪐">
+        <Infobox title={header} emoji="🪐">
            {children}
            <CopyInput text={text} prefix="$" />
        </Infobox>