Mirror of https://github.com/explosion/spaCy.git

Commit e22de2e69d: Merge branch 'develop' into nightly.spacy.io
				|  | @ -6,7 +6,7 @@ requires = [ | |||
|     "cymem>=2.0.2,<2.1.0", | ||||
|     "preshed>=3.0.2,<3.1.0", | ||||
|     "murmurhash>=0.28.0,<1.1.0", | ||||
|     "thinc>=8.0.0a33,<8.0.0a40", | ||||
|     "thinc>=8.0.0a34,<8.0.0a40", | ||||
|     "blis>=0.4.0,<0.5.0", | ||||
|     "pytokenizations", | ||||
|     "pathy" | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| # Our libraries | ||||
| cymem>=2.0.2,<2.1.0 | ||||
| preshed>=3.0.2,<3.1.0 | ||||
| thinc>=8.0.0a33,<8.0.0a40 | ||||
| thinc>=8.0.0a34,<8.0.0a40 | ||||
| blis>=0.4.0,<0.5.0 | ||||
| ml_datasets==0.2.0a0 | ||||
| murmurhash>=0.28.0,<1.1.0 | ||||
|  |  | |||
|  | @ -34,13 +34,13 @@ setup_requires = | |||
|     cymem>=2.0.2,<2.1.0 | ||||
|     preshed>=3.0.2,<3.1.0 | ||||
|     murmurhash>=0.28.0,<1.1.0 | ||||
|     thinc>=8.0.0a33,<8.0.0a40 | ||||
|     thinc>=8.0.0a34,<8.0.0a40 | ||||
| install_requires = | ||||
|     # Our libraries | ||||
|     murmurhash>=0.28.0,<1.1.0 | ||||
|     cymem>=2.0.2,<2.1.0 | ||||
|     preshed>=3.0.2,<3.1.0 | ||||
|     thinc>=8.0.0a33,<8.0.0a40 | ||||
|     thinc>=8.0.0a34,<8.0.0a40 | ||||
|     blis>=0.4.0,<0.5.0 | ||||
|     wasabi>=0.8.0,<1.1.0 | ||||
|     srsly>=2.1.0,<3.0.0 | ||||
|  |  | |||
|  | @ -308,6 +308,31 @@ def git_checkout( | |||
|         msg.fail("Destination of checkout must not exist", exits=1) | ||||
|     if not dest.parent.exists(): | ||||
|         raise IOError("Parent of destination of checkout must exist") | ||||
| 
 | ||||
|     if sparse and git_version >= (2, 22): | ||||
|         return git_sparse_checkout(repo, subpath, dest, branch) | ||||
|     elif sparse: | ||||
|         # Only show warnings if the user explicitly wants sparse checkout but | ||||
|         # the Git version doesn't support it | ||||
|         err_old = ( | ||||
|             f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " | ||||
|             f"that doesn't fully support sparse checkout yet." | ||||
|         ) | ||||
|         err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." | ||||
|         msg.warn( | ||||
|             f"{err_unk if git_version == (0, 0) else err_old} " | ||||
|             f"This means that more files than necessary may be downloaded " | ||||
|             f"temporarily. To only download the files needed, make sure " | ||||
|             f"you're using Git v2.22 or above." | ||||
|         ) | ||||
|     with make_tempdir() as tmp_dir: | ||||
|         cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" | ||||
|         ret = run_command(cmd, capture=True) | ||||
|         # We need Path(name) to make sure we also support subdirectories | ||||
|         shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) | ||||
| 
 | ||||
| 
 | ||||
| def git_sparse_checkout(repo, subpath, dest, branch): | ||||
|     # We're using Git, partial clone and sparse checkout to | ||||
|     # only clone the files we need | ||||
|     # This ends up being RIDICULOUS. omg. | ||||
|  | @ -324,47 +349,31 @@ def git_checkout( | |||
|     # *that* we can do by path. | ||||
|     # We're using Git and sparse checkout to only clone the files we need | ||||
|     with make_tempdir() as tmp_dir: | ||||
|         supports_sparse = git_version >= (2, 22) | ||||
|         use_sparse = supports_sparse and sparse | ||||
|         # This is the "clone, but don't download anything" part. | ||||
|         cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " | ||||
|         if use_sparse: | ||||
|             cmd += f"--filter=blob:none"  # <-- The key bit | ||||
|         # Only show warnings if the user explicitly wants sparse checkout but | ||||
|         # the Git version doesn't support it | ||||
|         elif sparse: | ||||
|             err_old = ( | ||||
|                 f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " | ||||
|                 f"that doesn't fully support sparse checkout yet." | ||||
|             ) | ||||
|             err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." | ||||
|             msg.warn( | ||||
|                 f"{err_unk if git_version == (0, 0) else err_old} " | ||||
|                 f"This means that more files than necessary may be downloaded " | ||||
|                 f"temporarily. To only download the files needed, make sure " | ||||
|                 f"you're using Git v2.22 or above." | ||||
|             ) | ||||
|         try_run_command(cmd) | ||||
|         cmd = ( | ||||
|             f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " | ||||
|             f"-b {branch} --filter=blob:none" | ||||
|         ) | ||||
|         run_command(cmd) | ||||
|         # Now we need to find the missing filenames for the subpath we want. | ||||
|         # Looking for this 'rev-list' command in the git --help? Hah. | ||||
|         cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}" | ||||
|         ret = try_run_command(cmd) | ||||
|         cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" | ||||
|         ret = run_command(cmd, capture=True) | ||||
|         git_repo = _from_http_to_git(repo) | ||||
|         # Now pass those missings into another bit of git internals | ||||
|         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) | ||||
|         if use_sparse and not missings: | ||||
|         if not missings: | ||||
|             err = ( | ||||
|                 f"Could not find any relevant files for '{subpath}'. " | ||||
|                 f"Did you specify a correct and complete path within repo '{repo}' " | ||||
|                 f"and branch {branch}?" | ||||
|             ) | ||||
|             msg.fail(err, exits=1) | ||||
|         if use_sparse: | ||||
|             cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" | ||||
|             try_run_command(cmd) | ||||
|         cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" | ||||
|         run_command(cmd, capture=True) | ||||
|         # And finally, we can checkout our subpath | ||||
|         cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" | ||||
|         try_run_command(cmd) | ||||
|         run_command(cmd, capture=True) | ||||
|         # We need Path(name) to make sure we also support subdirectories | ||||
|         shutil.move(str(tmp_dir / Path(subpath)), str(dest)) | ||||
| 
 | ||||
|  | @ -378,7 +387,7 @@ def get_git_version( | |||
|     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns | ||||
|         (0, 0) if the version couldn't be determined. | ||||
|     """ | ||||
|     ret = try_run_command(["git", "--version"], error=error) | ||||
|     ret = run_command("git --version", capture=True) | ||||
|     stdout = ret.stdout.strip() | ||||
|     if not stdout or not stdout.startswith("git version"): | ||||
|         return (0, 0) | ||||
|  | @ -386,23 +395,6 @@ def get_git_version( | |||
|     return (int(version[0]), int(version[1])) | ||||
| 
 | ||||
| 
 | ||||
| def try_run_command( | ||||
|     cmd: Union[str, List[str]], error: str = "Could not run command" | ||||
| ) -> subprocess.CompletedProcess: | ||||
|     """Try running a command and raise an error if it fails. | ||||
| 
 | ||||
|     cmd (Union[str, List[str]]): The command to run. | ||||
|     error (str): The error message. | ||||
|     RETURNS (CompletedProcess): The completed process if the command ran. | ||||
|     """ | ||||
|     try: | ||||
|         return run_command(cmd, capture=True) | ||||
|     except subprocess.CalledProcessError as e: | ||||
|         msg.fail(error) | ||||
|         print(cmd) | ||||
|         sys.exit(1) | ||||
| 
 | ||||
| 
 | ||||
| def _from_http_to_git(repo: str) -> str: | ||||
|     if repo.startswith("http://"): | ||||
|         repo = repo.replace(r"http://", r"https://") | ||||
|  |  | |||
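The refactored checkout logic above gates sparse checkout on `get_git_version()`, which returns a `(major, minor)` tuple and falls back to `(0, 0)` when the version can't be determined. Below is a minimal standalone sketch of that version gate, using plain `subprocess` instead of spaCy's `run_command` helper; it only assumes that `git --version` prints a line like `git version 2.30.1`.

```python
import subprocess

def get_git_version() -> tuple:
    # Run `git --version` and parse the "git version X.Y.Z" line.
    try:
        ret = subprocess.run(["git", "--version"], capture_output=True, encoding="utf8")
    except FileNotFoundError:
        return (0, 0)  # git not installed
    stdout = (ret.stdout or "").strip()
    if not stdout.startswith("git version"):
        return (0, 0)  # unknown version: sparse checkout gets disabled
    major, minor = stdout.split()[2].split(".")[:2]
    return (int(major), int(minor))

# Sparse checkout (clone --filter=blob:none plus rev-list --missing=print) needs
# Git v2.22+; older or unknown versions fall back to a full shallow clone.
if get_git_version() >= (2, 22):
    print("sparse checkout supported")
else:
    print("falling back to a full shallow clone")
```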
|  | @ -2,7 +2,7 @@ from typing import Dict, Any, Optional | |||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam | ||||
| from thinc.api import Model, data_validation | ||||
| from thinc.api import Model, data_validation, set_gpu_allocator | ||||
| import typer | ||||
| 
 | ||||
| from ._util import Arg, Opt, debug_cli, show_validation_error | ||||
|  | @ -53,7 +53,12 @@ def debug_model_cli( | |||
|     } | ||||
|     config_overrides = parse_config_overrides(ctx.args) | ||||
|     with show_validation_error(config_path): | ||||
|         config = util.load_config(config_path, overrides=config_overrides) | ||||
|         config = util.load_config( | ||||
|             config_path, overrides=config_overrides, interpolate=True | ||||
|         ) | ||||
|         allocator = config["training"]["gpu_allocator"] | ||||
|         if use_gpu >= 0 and allocator: | ||||
|             set_gpu_allocator(allocator) | ||||
|         nlp, config = util.load_model_from_config(config_path) | ||||
|     seed = config["training"]["seed"] | ||||
|     if seed is not None: | ||||
|  |  | |||
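The `debug_model_cli` change above loads the config with `interpolate=True` so that references like `${system.gpu_allocator}` are resolved before `gpu_allocator` is read. A hedged sketch of the same call outside the CLI; the path and override key are illustrative only:

```python
from spacy import util

# "config.cfg" is a hypothetical path; overrides use the same dot-notation
# keys you would pass on the command line (e.g. --training.gpu_allocator).
config = util.load_config(
    "config.cfg",
    overrides={"training.gpu_allocator": "pytorch"},
    interpolate=True,  # resolve ${section.key} references to concrete values
)
print(config["training"]["gpu_allocator"])  # "pytorch"
```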
|  | @ -110,7 +110,7 @@ def package( | |||
|     msg.good(f"Successfully created package '{model_name_v}'", main_path) | ||||
|     if create_sdist: | ||||
|         with util.working_dir(main_path): | ||||
|             util.run_command([sys.executable, "setup.py", "sdist"]) | ||||
|             util.run_command([sys.executable, "setup.py", "sdist"], capture=False) | ||||
|         zip_file = main_path / "dist" / f"{model_name_v}.tar.gz" | ||||
|         msg.good(f"Successfully created zipped Python package", zip_file) | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,10 +4,9 @@ import time | |||
| import re | ||||
| from collections import Counter | ||||
| from pathlib import Path | ||||
| from thinc.api import Config | ||||
| from thinc.api import use_pytorch_for_gpu_memory, require_gpu | ||||
| from thinc.api import require_gpu, set_gpu_allocator | ||||
| from thinc.api import set_dropout_rate, to_categorical, fix_random_seed | ||||
| from thinc.api import CosineDistance, L2Distance | ||||
| from thinc.api import Config, CosineDistance, L2Distance | ||||
| from wasabi import msg | ||||
| import srsly | ||||
| from functools import partial | ||||
|  | @ -32,7 +31,7 @@ def pretrain_cli( | |||
|     ctx: typer.Context,  # This is only used to read additional arguments | ||||
|     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), | ||||
|     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), | ||||
|     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), | ||||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), | ||||
|  | @ -99,10 +98,12 @@ def pretrain( | |||
|     epoch_resume: Optional[int] = None, | ||||
|     use_gpu: int = -1, | ||||
| ): | ||||
|     if config["system"].get("seed") is not None: | ||||
|         fix_random_seed(config["system"]["seed"]) | ||||
|     if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"): | ||||
|         use_pytorch_for_gpu_memory() | ||||
|     if config["training"]["seed"] is not None: | ||||
|         fix_random_seed(config["training"]["seed"]) | ||||
|     allocator = config["training"]["gpu_allocator"] | ||||
|     if use_gpu >= 0 and allocator: | ||||
|         set_gpu_allocator(allocator) | ||||
| 
 | ||||
|     nlp, config = util.load_model_from_config(config) | ||||
|     P_cfg = config["pretraining"] | ||||
|     corpus = dot_to_object(config, P_cfg["corpus"]) | ||||
|  |  | |||
|  | @ -59,8 +59,9 @@ def project_run( | |||
|         for dep in cmd.get("deps", []): | ||||
|             if not (project_dir / dep).exists(): | ||||
|                 err = f"Missing dependency specified by command '{subcommand}': {dep}" | ||||
|                 err_help = "Maybe you forgot to run the 'project assets' command?" | ||||
|                 err_kwargs = {"exits": 1} if not dry else {} | ||||
|                 msg.fail(err, **err_kwargs) | ||||
|                 msg.fail(err, err_help, **err_kwargs) | ||||
|         with working_dir(project_dir) as current_dir: | ||||
|             rerun = check_rerun(current_dir, cmd) | ||||
|             if not rerun and not force: | ||||
|  | @ -144,7 +145,7 @@ def run_commands( | |||
|         if not silent: | ||||
|             print(f"Running command: {join_command(command)}") | ||||
|         if not dry: | ||||
|             run_command(command) | ||||
|             run_command(command, capture=False) | ||||
| 
 | ||||
| 
 | ||||
| def validate_subcommand( | ||||
|  |  | |||
|  | @ -8,7 +8,11 @@ train = "" | |||
| dev = "" | ||||
| 
 | ||||
| [system] | ||||
| use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }} | ||||
| {% if use_transformer -%} | ||||
| gpu_allocator = "pytorch" | ||||
| {% else -%} | ||||
| gpu_allocator = null | ||||
| {% endif %} | ||||
| 
 | ||||
| [nlp] | ||||
| lang = "{{ lang }}" | ||||
|  |  | |||
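The quickstart template now emits `gpu_allocator` via a Jinja conditional instead of the old `use_pytorch_for_gpu_memory` flag. A small sketch of how that fragment renders, assuming the `jinja2` package is installed (the quickstart templates use Jinja-style syntax):

```python
from jinja2 import Template

# The gpu_allocator fragment from the template above, as a Jinja2 string.
fragment = (
    '{% if use_transformer -%}\n'
    'gpu_allocator = "pytorch"\n'
    '{% else -%}\n'
    'gpu_allocator = null\n'
    '{% endif %}'
)

print(Template(fragment).render(use_transformer=True).strip())   # gpu_allocator = "pytorch"
print(Template(fragment).render(use_transformer=False).strip())  # gpu_allocator = null
```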
|  | @ -6,8 +6,7 @@ from pathlib import Path | |||
| from wasabi import msg | ||||
| import thinc | ||||
| import thinc.schedules | ||||
| from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed | ||||
| from thinc.api import Config, Optimizer | ||||
| from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator | ||||
| import random | ||||
| import typer | ||||
| import logging | ||||
|  | @ -29,7 +28,7 @@ def train_cli( | |||
|     ctx: typer.Context,  # This is only used to read additional arguments | ||||
|     config_path: Path = Arg(..., help="Path to config file", exists=True), | ||||
|     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), | ||||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), | ||||
|     resume: bool = Opt(False, "--resume", "-R", help="Resume training"), | ||||
|  | @ -79,15 +78,16 @@ def train( | |||
|         config = util.load_config( | ||||
|             config_path, overrides=config_overrides, interpolate=True | ||||
|         ) | ||||
|     if config.get("training", {}).get("seed") is not None: | ||||
|     if config["training"]["seed"] is not None: | ||||
|         fix_random_seed(config["training"]["seed"]) | ||||
|     if config.get("system", {}).get("use_pytorch_for_gpu_memory"): | ||||
|         # It feels kind of weird to not have a default for this. | ||||
|         use_pytorch_for_gpu_memory() | ||||
|     allocator = config["training"]["gpu_allocator"] | ||||
|     if use_gpu >= 0 and allocator: | ||||
|         set_gpu_allocator(allocator) | ||||
|     # Use original config here before it's resolved to functions | ||||
|     sourced_components = get_sourced_components(config) | ||||
|     with show_validation_error(config_path): | ||||
|         nlp, config = util.load_model_from_config(config) | ||||
|     util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"]) | ||||
|     if config["training"]["vectors"] is not None: | ||||
|         util.load_vectors_into_model(nlp, config["training"]["vectors"]) | ||||
|     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) | ||||
|  |  | |||
|  | @ -6,13 +6,12 @@ init_tok2vec = null | |||
| 
 | ||||
| [system] | ||||
| seed = 0 | ||||
| use_pytorch_for_gpu_memory = false | ||||
| gpu_allocator = null | ||||
| 
 | ||||
| [nlp] | ||||
| lang = null | ||||
| pipeline = [] | ||||
| disabled = [] | ||||
| load_vocab_data = true | ||||
| before_creation = null | ||||
| after_creation = null | ||||
| after_pipeline_creation = null | ||||
|  | @ -52,12 +51,14 @@ limit = 0 | |||
| # Training hyper-parameters and additional features. | ||||
| [training] | ||||
| seed = ${system.seed} | ||||
| gpu_allocator = ${system.gpu_allocator} | ||||
| dropout = 0.1 | ||||
| accumulate_gradient = 1 | ||||
| # Extra resources for transfer-learning or pseudo-rehearsal | ||||
| init_tok2vec = ${paths.init_tok2vec} | ||||
| raw_text = ${paths.raw} | ||||
| vectors = null | ||||
| lookups = null | ||||
| # Controls early-stopping. 0 or -1 mean unlimited. | ||||
| patience = 1600 | ||||
| max_epochs = 0 | ||||
|  | @ -75,7 +76,6 @@ train_corpus = "corpora.train" | |||
| [training.logger] | ||||
| @loggers = "spacy.ConsoleLogger.v1" | ||||
| 
 | ||||
| 
 | ||||
| [training.batcher] | ||||
| @batchers = "spacy.batch_by_words.v1" | ||||
| discard_oversize = false | ||||
|  |  | |||
|  | @ -31,6 +31,7 @@ from .schemas import ConfigSchema | |||
| from .git_info import GIT_VERSION | ||||
| from . import util | ||||
| from . import about | ||||
| from .lookups import load_lookups | ||||
| 
 | ||||
| 
 | ||||
| # This is the base config will all settings (training etc.) | ||||
|  | @ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: | |||
|     return tokenizer_factory | ||||
| 
 | ||||
| 
 | ||||
| @registry.misc("spacy.LookupsDataLoader.v1") | ||||
| def load_lookups_data(lang, tables): | ||||
|     util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") | ||||
|     lookups = load_lookups(lang=lang, tables=tables) | ||||
|     return lookups | ||||
| 
 | ||||
| 
 | ||||
| class Language: | ||||
|     """A text-processing pipeline. Usually you'll load this once per process, | ||||
|     and pass the instance around your application. | ||||
|  | @ -152,7 +160,6 @@ class Language: | |||
|                 self.lang, | ||||
|                 self.Defaults, | ||||
|                 vectors_name=vectors_name, | ||||
|                 load_data=self._config["nlp"]["load_vocab_data"], | ||||
|             ) | ||||
|         else: | ||||
|             if (self.lang and vocab.lang) and (self.lang != vocab.lang): | ||||
|  |  | |||
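`spacy.LookupsDataLoader.v1` is registered in the `misc` registry, so a config can reference it (for example from a `[training.lookups]` block) to build a `Lookups` object from `spacy-lookups-data`. A hedged sketch of calling the registered function directly; it assumes `spacy-lookups-data` is installed and that importing spaCy has registered the function:

```python
from spacy.util import registry

# Look up the registered factory and call it like a normal function.
load_lookups_data = registry.get("misc", "spacy.LookupsDataLoader.v1")
lookups = load_lookups_data(lang="en", tables=["lexeme_norm"])
print(list(lookups.tables))  # ["lexeme_norm"]
```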
|  | @ -8,6 +8,7 @@ from collections import defaultdict | |||
| from thinc.api import Optimizer | ||||
| 
 | ||||
| from .attrs import NAMES | ||||
| from .lookups import Lookups | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     # This lets us add type hints for mypy etc. without causing circular imports | ||||
|  | @ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel): | |||
| class ConfigSchemaTraining(BaseModel): | ||||
|     # fmt: off | ||||
|     vectors: Optional[StrictStr] = Field(..., title="Path to vectors") | ||||
|     lookups: Optional[Lookups] = Field(..., title="Vocab lookups") | ||||
|     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") | ||||
|     train_corpus: StrictStr = Field(..., title="Path in the config to the training data") | ||||
|     batcher: Batcher = Field(..., title="Batcher for the training data") | ||||
|  | @ -207,6 +209,7 @@ class ConfigSchemaTraining(BaseModel): | |||
|     max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for") | ||||
|     eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)") | ||||
|     seed: Optional[StrictInt] = Field(..., title="Random seed") | ||||
|     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") | ||||
|     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") | ||||
|     score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") | ||||
|     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") | ||||
|  | @ -227,7 +230,6 @@ class ConfigSchemaNlp(BaseModel): | |||
|     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") | ||||
|     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default") | ||||
|     tokenizer: Callable = Field(..., title="The tokenizer to use") | ||||
|     load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") | ||||
|     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") | ||||
|     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") | ||||
|     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") | ||||
|  |  | |||
|  | @ -69,7 +69,6 @@ def test_util_dot_section(): | |||
|     [nlp] | ||||
|     lang = "en" | ||||
|     pipeline = ["textcat"] | ||||
|     load_vocab_data = false | ||||
| 
 | ||||
|     [components] | ||||
| 
 | ||||
|  | @ -95,15 +94,13 @@ def test_util_dot_section(): | |||
|     # not exclusive_classes | ||||
|     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False | ||||
|     # Test that default values got overwritten | ||||
|     assert not en_config["nlp"]["load_vocab_data"] | ||||
|     assert nl_config["nlp"]["load_vocab_data"]  # default value True | ||||
|     assert en_config["nlp"]["pipeline"] == ["textcat"] | ||||
|     assert nl_config["nlp"]["pipeline"] == [] # default value [] | ||||
|     # Test proper functioning of 'dot_to_object' | ||||
|     with pytest.raises(KeyError): | ||||
|         dot_to_object(en_config, "nlp.pipeline.tagger") | ||||
|     with pytest.raises(KeyError): | ||||
|         dot_to_object(en_config, "nlp.unknownattribute") | ||||
|     assert not dot_to_object(en_config, "nlp.load_vocab_data") | ||||
|     assert dot_to_object(nl_config, "nlp.load_vocab_data") | ||||
|     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
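The updated test exercises `dot_to_object`, which resolves a dot-notation path against a nested config dict and raises `KeyError` for unknown paths. A minimal sketch with a made-up dict:

```python
from spacy.util import dot_to_object

config = {"training": {"optimizer": {"learn_rate": 0.001}}}
print(dot_to_object(config, "training.optimizer.learn_rate"))  # 0.001

try:
    dot_to_object(config, "training.unknown")
except KeyError as err:
    print(f"no such path: {err}")
```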
|  | @ -253,6 +253,14 @@ def load_vectors_into_model( | |||
|                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) | ||||
| 
 | ||||
| 
 | ||||
| def load_vocab_data_into_model( | ||||
|     nlp: "Language", *, lookups: Optional["Lookups"] = None | ||||
| ) -> None: | ||||
|     """Load vocab data.""" | ||||
|     if lookups: | ||||
|         nlp.vocab.lookups = lookups | ||||
| 
 | ||||
| 
 | ||||
| def load_model( | ||||
|     name: Union[str, Path], | ||||
|     *, | ||||
|  | @ -651,8 +659,8 @@ def join_command(command: List[str]) -> str: | |||
| def run_command( | ||||
|     command: Union[str, List[str]], | ||||
|     *, | ||||
|     capture: bool = False, | ||||
|     stdin: Optional[Any] = None, | ||||
|     capture: bool = False, | ||||
| ) -> Optional[subprocess.CompletedProcess]: | ||||
|     """Run a command on the command line as a subprocess. If the subprocess | ||||
|     returns a non-zero exit code, a system exit is performed. | ||||
|  | @ -660,33 +668,46 @@ def run_command( | |||
|     command (str / List[str]): The command. If provided as a string, the | ||||
|         string will be split using shlex.split. | ||||
|     stdin (Optional[Any]): stdin to read from or None. | ||||
|     capture (bool): Whether to capture the output. | ||||
|     capture (bool): Whether to capture the output and errors. If False, | ||||
|         the stdout and stderr will not be redirected, and if there's an error, | ||||
|         sys.exit will be called with the returncode. You should use capture=False | ||||
|         when you want to turn over execution to the command, and capture=True | ||||
|         when you want to run the command more like a function. | ||||
|     RETURNS (Optional[CompletedProcess]): The process object. | ||||
|     """ | ||||
|     if isinstance(command, str): | ||||
|         command = split_command(command) | ||||
|         cmd_list = split_command(command) | ||||
|         cmd_str = command | ||||
|     else: | ||||
|         cmd_list = command | ||||
|         cmd_str = " ".join(command) | ||||
|     try: | ||||
|         ret = subprocess.run( | ||||
|             command, | ||||
|             cmd_list, | ||||
|             env=os.environ.copy(), | ||||
|             input=stdin, | ||||
|             encoding="utf8", | ||||
|             check=True, | ||||
|             check=False, | ||||
|             stdout=subprocess.PIPE if capture else None, | ||||
|             stderr=subprocess.PIPE if capture else None, | ||||
|             stderr=subprocess.STDOUT if capture else None, | ||||
|         ) | ||||
|     except FileNotFoundError: | ||||
|         # Indicates the *command* wasn't found, it's an error before the command | ||||
|         # is run. | ||||
|         raise FileNotFoundError( | ||||
|             Errors.E970.format(str_command=" ".join(command), tool=command[0]) | ||||
|             Errors.E970.format(str_command=cmd_str, tool=cmd_list[0]) | ||||
|         ) from None | ||||
|     except subprocess.CalledProcessError as e: | ||||
|         # We don't want a duplicate traceback here so we're making sure the | ||||
|         # CalledProcessError isn't re-raised. We also print both the string | ||||
|         # message and the stderr, in case the error only has one of them. | ||||
|         print(e.stderr) | ||||
|         print(e) | ||||
|         sys.exit(1) | ||||
|     if ret.returncode != 0: | ||||
|     if ret.returncode != 0 and capture: | ||||
|         message = f"Error running command:\n\n{cmd_str}\n\n" | ||||
|         message += f"Subprocess exited with status {ret.returncode}" | ||||
|         if ret.stdout is not None: | ||||
|             message += f"\n\nProcess log (stdout and stderr):\n\n" | ||||
|             message += ret.stdout | ||||
|         error = subprocess.SubprocessError(message) | ||||
|         error.ret = ret | ||||
|         error.command = cmd_str | ||||
|         raise error | ||||
|     elif ret.returncode != 0: | ||||
|         sys.exit(ret.returncode) | ||||
|     return ret | ||||
| 
 | ||||
|  |  | |||
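Under the new semantics, `capture=True` runs the subprocess like a function call (stdout and stderr are merged, returned on the result object, and a non-zero exit raises `subprocess.SubprocessError`), while `capture=False` streams output to the terminal and exits with the command's return code on failure. A hedged usage sketch:

```python
import subprocess
from spacy.util import run_command

# Run like a function: output is captured on the returned CompletedProcess.
ret = run_command("git --version", capture=True)
print(ret.stdout.strip())  # e.g. "git version 2.30.1"

# Hand execution over to the command: output goes straight to the terminal.
run_command(["git", "--version"], capture=False)

# With capture=True, a failing command surfaces as an exception you can handle.
try:
    run_command("git --definitely-not-a-real-flag", capture=True)
except subprocess.SubprocessError as err:
    print(f"command failed: {err}")
```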
|  | @ -28,7 +28,7 @@ cdef class Vocab: | |||
|     cpdef readonly StringStore strings | ||||
|     cpdef public Morphology morphology | ||||
|     cpdef public object vectors | ||||
|     cpdef public object lookups | ||||
|     cpdef public object _lookups | ||||
|     cpdef public object writing_system | ||||
|     cpdef public object get_noun_chunks | ||||
|     cdef readonly int length | ||||
|  |  | |||
|  | @ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS | |||
| from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang | ||||
| 
 | ||||
| 
 | ||||
| def create_vocab(lang, defaults, vectors_name=None, load_data=True): | ||||
| def create_vocab(lang, defaults, vectors_name=None): | ||||
|     # If the spacy-lookups-data package is installed, we pre-populate the lookups | ||||
|     # with lexeme data, if available | ||||
|     if load_data: | ||||
|         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"] | ||||
|         lookups = load_lookups(lang, tables=tables, strict=False) | ||||
|     else: | ||||
|         lookups = Lookups() | ||||
|     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} | ||||
|     # This is messy, but it's the minimal working fix to Issue #639. | ||||
|     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words) | ||||
|  | @ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True): | |||
|     lex_attrs[NORM] = util.add_lookups( | ||||
|         lex_attrs.get(NORM, LEX_ATTRS[NORM]), | ||||
|         BASE_NORMS, | ||||
|         lookups.get_table("lexeme_norm", {}), | ||||
|     ) | ||||
|     return Vocab( | ||||
|         lex_attr_getters=lex_attrs, | ||||
|         lookups=lookups, | ||||
|         writing_system=defaults.writing_system, | ||||
|         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), | ||||
|         vectors_name=vectors_name, | ||||
|  | @ -424,6 +417,19 @@ cdef class Vocab: | |||
|             orth = self.strings.add(orth) | ||||
|         return orth in self.vectors | ||||
| 
 | ||||
|     property lookups: | ||||
|         def __get__(self): | ||||
|             return self._lookups | ||||
| 
 | ||||
|         def __set__(self, lookups): | ||||
|             self._lookups = lookups | ||||
|             if lookups.has_table("lexeme_norm"): | ||||
|                 self.lex_attr_getters[NORM] = util.add_lookups( | ||||
|                     self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), | ||||
|                     self.lookups.get_table("lexeme_norm"), | ||||
|                 ) | ||||
| 
 | ||||
| 
 | ||||
|     def to_disk(self, path, *, exclude=tuple()): | ||||
|         """Save the current state to a directory. | ||||
| 
 | ||||
|  |  | |||
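Because `lookups` is now a property on `Vocab`, assigning a `Lookups` object that contains a `lexeme_norm` table immediately rewires the `NORM` lexeme attribute getter. A small sketch; the norm entry is a made-up example, and real tables normally come from `spacy-lookups-data`:

```python
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")

lookups = Lookups()
lookups.add_table("lexeme_norm", {"gonna": "going to"})  # illustrative entry

# The property setter also refreshes the NORM lex_attr_getter.
nlp.vocab.lookups = lookups
print(nlp.vocab["gonna"].norm_)  # expected: "going to"
```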
|  | @ -763,6 +763,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides | |||
| | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~                                                                              | | ||||
| | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       | | ||||
| | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                               | | ||||
| | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                 | | ||||
| | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                 | | ||||
| | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | ||||
| | **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                  | | ||||
|  | @ -798,11 +799,12 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [ | |||
| 
 | ||||
| | Name                    | Description                                                                                                                                                                           | | ||||
| | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                              | | ||||
| | `config_path`           | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                           | | ||||
| | `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                              | | ||||
| | `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~  | | ||||
| | `--resume-path`, `-r`   | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                             | | ||||
| | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                   | | ||||
| | `--gpu-id`, `-g`        | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                            | | ||||
| | `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                            | | ||||
| | overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ | | ||||
| | **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                  | | ||||
|  | @ -893,8 +895,6 @@ what you need. By default, spaCy's | |||
| can provide any other repo (public or private) that you have access to using the | ||||
| `--repo` option. | ||||
| 
 | ||||
| <!-- TODO: update example once we've decided on repo structure --> | ||||
| 
 | ||||
| ```cli | ||||
| $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] | ||||
| ``` | ||||
|  | @ -902,7 +902,7 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] | |||
| > #### Example | ||||
| > | ||||
| > ```cli | ||||
| > $ python -m spacy project clone some_example | ||||
| > $ python -m spacy project clone pipelines/ner_wikiner | ||||
| > ``` | ||||
| > | ||||
| > Clone from custom repo: | ||||
|  |  | |||
|  | @ -60,7 +60,6 @@ your config and check that it's valid, you can run the | |||
| > [nlp] | ||||
| > lang = "en" | ||||
| > pipeline = ["tagger", "parser", "ner"] | ||||
| > load_vocab_data = true | ||||
| > before_creation = null | ||||
| > after_creation = null | ||||
| > after_pipeline_creation = null | ||||
|  | @ -77,7 +76,6 @@ Defines the `nlp` object, its tokenizer and | |||
| | `lang`                    | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~                                                                                                                                                                                        | | ||||
| | `pipeline`                | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~                                                                        | | ||||
| | `disabled`                | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | | ||||
| | `load_vocab_data`         | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~                                                                                                                                | | ||||
| | `before_creation`         | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~                                                                                                      | | ||||
| | `after_creation`          | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                                                    | | ||||
| | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                                   | | ||||
|  | @ -190,7 +188,9 @@ process that are used when you run [`spacy train`](/api/cli#train). | |||
| | `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                               | | ||||
| | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                    | | ||||
| | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                              | | ||||
| | `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                            | | ||||
| | `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                              | | ||||
| | `lookups`             | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                     | | ||||
| | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                              | | ||||
| | `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                    | | ||||
| | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                      | | ||||
|  | @ -475,7 +475,7 @@ lexical data. | |||
| Here's an example of the 20 most frequent lexemes in the English training data: | ||||
| 
 | ||||
| ```json | ||||
| %%GITHUB_SPACY / extra / example_data / vocab - data.jsonl | ||||
| %%GITHUB_SPACY/extra/example_data/vocab-data.jsonl | ||||
| ``` | ||||
| 
 | ||||
| ## Pipeline meta {#meta} | ||||
|  |  | |||
|  | @ -145,9 +145,10 @@ pipelines. | |||
| > nlp = spacy.load("en_core_web_sm") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                             | | ||||
| | ----------- | --------------------------------------- | | ||||
| | **RETURNS** | Whether the GPU was activated. ~~bool~~ | | ||||
| | Name        | Description                                      | | ||||
| | ----------- | ------------------------------------------------ | | ||||
| | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ | | ||||
| | **RETURNS** | Whether the GPU was activated. ~~bool~~          | | ||||
| 
 | ||||
| ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"} | ||||
| 
 | ||||
|  | @ -164,9 +165,10 @@ and _before_ loading any pipelines. | |||
| > nlp = spacy.load("en_core_web_sm") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description     | | ||||
| | ----------- | --------------- | | ||||
| | **RETURNS** | `True` ~~bool~~ | | ||||
| | Name        | Description                                      | | ||||
| | ----------- | ------------------------------------------------ | | ||||
| | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ | | ||||
| | **RETURNS** | `True` ~~bool~~                                  | | ||||
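Both `spacy.prefer_gpu` and `spacy.require_gpu` now accept a device index. A short sketch of selecting a specific device before loading any pipelines; it assumes a CUDA-capable GPU with index 0 and that the `en_core_web_sm` package is installed:

```python
import spacy

# Try to allocate on GPU 0; returns False and stays on CPU if no GPU is available.
activated = spacy.prefer_gpu(0)
print("GPU active:", activated)

nlp = spacy.load("en_core_web_sm")  # load pipelines only after selecting the device
```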
| 
 | ||||
| ## displaCy {#displacy source="spacy/displacy"} | ||||
| 
 | ||||
|  | @ -456,6 +458,16 @@ remain in the config file stored on your local system. | |||
| | `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | | ||||
| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                              | | ||||
| 
 | ||||
| <Project id="integrations/wandb"> | ||||
| 
 | ||||
| Get started with tracking your spaCy training runs in Weights & Biases using our | ||||
| project template. It trains on the IMDB Movie Review Dataset and includes a | ||||
| simple config with the built-in `WandbLogger`, as well as a custom example of | ||||
| creating variants of the config for a simple hyperparameter grid search and | ||||
| logging the results. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ## Readers {#readers source="spacy/training/corpus.py" new="3"} | ||||
| 
 | ||||
| Corpus readers are registered functions that load data and return a function | ||||
|  |  | |||
|  | @ -289,8 +289,7 @@ of objects by referring to creation functions, including functions you register | |||
| yourself. For details on how to get started with training your own model, check | ||||
| out the [training quickstart](/usage/training#quickstart). | ||||
| 
 | ||||
| <!-- TODO: | ||||
| <Project id="en_core_trf_lg"> | ||||
| <!-- TODO: <Project id="en_core_trf_lg"> | ||||
| 
 | ||||
| The easiest way to get started is to clone a transformers-based project | ||||
| template. Swap in your data, edit the settings and hyperparameters and train, | ||||
|  | @ -623,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`, | |||
| `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and | ||||
| expect the same types of objects, although for pretraining your corpus does not | ||||
| need to have any annotations, so you will often use a different reader, such as | ||||
| the [`JsonlReader`](/api/toplevel#jsonlreader). | ||||
| the [`JsonlReader`](/api/top-level#jsonlreader). | ||||
| 
 | ||||
| > #### Raw text format | ||||
| > | ||||
|  | @ -655,6 +654,16 @@ and pass in optional config overrides, like the path to the raw text file: | |||
| $ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl | ||||
| ``` | ||||
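The `--paths.raw` override above points at a JSONL file of raw text, one object per line with a `"text"` key. A hedged sketch of writing such a file with `srsly` (the texts and the output path are placeholders):

```python
import srsly

texts = [
    {"text": "Can I ask where you work now and what you do?"},
    {"text": "They may have other reasons for being there."},
]
srsly.write_jsonl("./text.jsonl", texts)  # one JSON object per line
```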
| 
 | ||||
| The following defaults are used for the `[pretraining]` block and merged into | ||||
| your existing config when you run [`init config`](/api/cli#init-config) or | ||||
| [`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed, | ||||
| you can [configure](#pretraining-configure) the settings and hyperparameters or | ||||
| change the [objective](#pretraining-details). | ||||
| 
 | ||||
| ```ini | ||||
| %%GITHUB_SPACY/spacy/default_config_pretraining.cfg | ||||
| ``` | ||||
| 
 | ||||
| ### How pretraining works {#pretraining-details} | ||||
| 
 | ||||
| The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually | ||||
|  |  | |||
|  | @ -45,7 +45,7 @@ spaCy v3.0 introduces transformer-based pipelines that bring spaCy's accuracy | |||
| right up to **current state-of-the-art**. You can also use a CPU-optimized | ||||
| pipeline, which is less accurate but much cheaper to run. | ||||
| 
 | ||||
| <!-- TODO: --> | ||||
| <!-- TODO: update benchmarks and intro --> | ||||
| 
 | ||||
| > #### Evaluation details | ||||
| > | ||||
|  | @ -68,6 +68,6 @@ our project template. | |||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| <!-- ## Citing spaCy {#citation} | ||||
| <!-- TODO: ## Citing spaCy {#citation} | ||||
| 
 | ||||
| <!-- TODO: update --> | ||||
| --> | ||||
|  |  | |||
|  | @ -356,6 +356,18 @@ that training configs are complete and experiments fully reproducible. | |||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| Note that when using a PyTorch or TensorFlow model, it is recommended to set the | ||||
| GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or | ||||
| "tensorflow" in the training config, cupy will allocate memory via those | ||||
| respective libraries, preventing out-of-memory errors when there's free memory | ||||
| sitting unused in the other library's pool. | ||||
| 
 | ||||
| ```ini | ||||
| ### config.cfg (excerpt) | ||||
| [training] | ||||
| gpu_allocator = "pytorch" | ||||
| ``` | ||||
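If you are writing your own training loop rather than using `spacy train`, the same effect can be achieved programmatically with Thinc's helpers; a minimal sketch, assuming a CUDA-capable GPU and PyTorch installed:

```python
from thinc.api import require_gpu, set_gpu_allocator

set_gpu_allocator("pytorch")  # route cupy allocations through PyTorch's memory pool
require_gpu(0)                # activate GPU 0 before creating any models
```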
| 
 | ||||
| ## Custom models with Thinc {#thinc} | ||||
| 
 | ||||
| Of course it's also possible to define the `Model` from the previous section | ||||
|  | @ -477,7 +489,7 @@ with Model.define_operators({">>": chain}): | |||
| <Infobox title="This section is still under construction" emoji="🚧" variant="warning"> | ||||
| </Infobox> | ||||
| 
 | ||||
| <!-- TODO: | ||||
| <!-- TODO: write trainable component section | ||||
| - Interaction with `predict`, `get_loss` and `set_annotations` | ||||
| - Initialization life-cycle with `begin_training`, correlation with add_label | ||||
| Example: relation extraction component (implemented as project template) | ||||
|  |  | |||
|  | @ -381,8 +381,6 @@ and loading pipeline packages, the underlying functionality is entirely based on | |||
| native Python packaging. This allows your application to handle a spaCy pipeline | ||||
| like any other package dependency. | ||||
| 
 | ||||
| <!-- TODO: reference relevant spaCy project --> | ||||
| 
 | ||||
| ### Downloading and requiring package dependencies {#models-download} | ||||
| 
 | ||||
| spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a | ||||
|  |  | |||
|  | @ -29,15 +29,13 @@ and share your results with your team. spaCy projects can be used via the new | |||
| 
 | ||||
|  | ||||
| 
 | ||||
| <!-- TODO: | ||||
| <Project id="some_example_project"> | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum | ||||
| sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat | ||||
| mattis pretium. | ||||
| The easiest way to get started is to clone a project template and run it – for | ||||
| example, this end-to-end template that lets you train a **part-of-speech | ||||
| tagger** and **dependency parser** on a Universal Dependencies treebank. | ||||
| 
 | ||||
| </Project> | ||||
| --> | ||||
| 
 | ||||
| spaCy projects make it easy to integrate with many other **awesome tools** in | ||||
| the data science and machine learning ecosystem to track and manage your data | ||||
|  | @ -65,10 +63,8 @@ project template and copies the files to a local directory. You can then run the | |||
| project, e.g. to train a pipeline and edit the commands and scripts to build | ||||
| fully custom workflows. | ||||
| 
 | ||||
| <!-- TODO: update with real example project --> | ||||
| 
 | ||||
| ```cli | ||||
| python -m spacy project clone some_example_project | ||||
| python -m spacy project clone pipelines/tagger_parser_ud | ||||
| ``` | ||||
| 
 | ||||
| By default, the project will be cloned into the current working directory. You | ||||
|  | @ -216,10 +212,8 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up | |||
| a quick web demo. It looks pretty similar to a config file used to define CI | ||||
| pipelines. | ||||
| 
 | ||||
| <!-- TODO: update with better (final) example --> | ||||
| 
 | ||||
| ```yaml | ||||
| https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml | ||||
| https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml | ||||
| ``` | ||||
| 
 | ||||
| | Section       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | | ||||
|  | @ -976,14 +970,12 @@ your results. | |||
| 
 | ||||
|  | ||||
| 
 | ||||
| <!-- TODO: | ||||
| 
 | ||||
| <Project id="integrations/wandb"> | ||||
| 
 | ||||
| Get started with tracking your spaCy training runs in Weights & Biases using our | ||||
| project template. It includes a simple config using the `WandbLogger`, as well | ||||
| as a custom logger implementation you can adjust for your specific use case. | ||||
| project template. It trains on the IMDB Movie Review Dataset and includes a | ||||
| simple config with the built-in `WandbLogger`, as well as a custom example of | ||||
| creating variants of the config for a simple hyperparameter grid search and | ||||
| logging the results. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| --> | ||||
|  |  | |||
|  | @ -574,7 +574,7 @@ The directory will be created if it doesn't exist, and the whole pipeline data, | |||
| meta and configuration will be written out. To make the pipeline more convenient | ||||
| to deploy, we recommend wrapping it as a [Python package](/api/cli#package). | ||||
| 
 | ||||
| <Accordion title="What’s the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config"> | ||||
| <Accordion title="What’s the difference between the config.cfg and meta.json?" id="models-meta-vs-config" spaced> | ||||
| 
 | ||||
| When you save a pipeline in spaCy v3.0+, two files will be exported: a | ||||
| [`config.cfg`](/api/data-formats#config) based on | ||||
|  | @ -596,6 +596,15 @@ based on [`nlp.meta`](/api/language#meta). | |||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| The easiest way to get started with an end-to-end workflow is to clone a | ||||
| [project template](/usage/projects) and run it – for example, this template that | ||||
| lets you train a **part-of-speech tagger** and **dependency parser** on a | ||||
| Universal Dependencies treebank and generates an installable Python package. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ### Generating a pipeline package {#models-generating} | ||||
| 
 | ||||
| <Infobox title="Important note" variant="warning"> | ||||
|  | @ -699,5 +708,3 @@ class and call [`from_disk`](/api/language#from_disk) instead. | |||
| ```python | ||||
| nlp = spacy.blank("en").from_disk("/path/to/data") | ||||
| ``` | ||||
| 
 | ||||
| <!-- TODO: point to spaCy projects? --> | ||||
|  |  | |||
|  | @ -92,7 +92,7 @@ spaCy's binary `.spacy` format. You can either include the data paths in the | |||
| $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy | ||||
| ``` | ||||
| 
 | ||||
| <Accordion title="How are the config recommendations generated?" id="quickstart-source"> | ||||
| <Accordion title="How are the config recommendations generated?" id="quickstart-source" spaced> | ||||
| 
 | ||||
| The recommended config settings generated by the quickstart widget and the | ||||
| [`init config`](/api/cli#init-config) command are based on some general **best | ||||
|  | @ -112,6 +112,15 @@ as we run more experiments. | |||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| The easiest way to get started is to clone a [project template](/usage/projects) | ||||
| and run it – for example, this end-to-end template that lets you train a | ||||
| **part-of-speech tagger** and **dependency parser** on a Universal Dependencies | ||||
| treebank. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ## Training config {#config} | ||||
| 
 | ||||
| Training config files include all **settings and hyperparameters** for training | ||||
|  |  | |||
|  | @ -176,18 +176,16 @@ freely combine implementations from different frameworks into a single model. | |||
| 
 | ||||
| ### Manage end-to-end workflows with projects {#features-projects} | ||||
| 
 | ||||
| <!-- TODO: update example --> | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```cli | ||||
| > # Clone a project template | ||||
| > $ python -m spacy project clone example | ||||
| > $ cd example | ||||
| > $ python -m spacy project clone pipelines/tagger_parser_ud | ||||
| > $ cd tagger_parser_ud | ||||
| > # Download data assets | ||||
| > $ python -m spacy project assets | ||||
| > # Run a workflow | ||||
| > $ python -m spacy project run train | ||||
| > $ python -m spacy project run all | ||||
| > ``` | ||||
| 
 | ||||
| spaCy projects let you manage and share **end-to-end spaCy workflows** for | ||||
|  | @ -207,14 +205,6 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps, | |||
| [Ray](/usage/projects#ray) for parallel training, | ||||
| [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more! | ||||
| 
 | ||||
| <!-- <Project id="some_example_project"> | ||||
| 
 | ||||
| The easiest way to get started with an end-to-end training process is to clone a | ||||
| [project](/usage/projects) template. Projects let you manage multi-step | ||||
| workflows, from data preprocessing to training and packaging your pipeline. | ||||
| 
 | ||||
| </Project>--> | ||||
| 
 | ||||
| <Infobox title="Details & Documentation" emoji="📖" list> | ||||
| 
 | ||||
| - **Usage:** [spaCy projects](/usage/projects), | ||||
|  | @ -224,6 +214,15 @@ workflows, from data preprocessing to training and packaging your pipeline. | |||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| The easiest way to get started is to clone a [project template](/usage/projects) | ||||
| and run it – for example, this end-to-end template that lets you train a | ||||
| **part-of-speech tagger** and **dependency parser** on a Universal Dependencies | ||||
| treebank. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ### Parallel and distributed training with Ray {#features-parallel-training} | ||||
| 
 | ||||
| > #### Example | ||||
|  | @ -875,7 +874,14 @@ values. You can then use the auto-generated `config.cfg` for training: | |||
| + python -m spacy train ./config.cfg --output ./output | ||||
| ``` | ||||
| 
 | ||||
| <!-- TODO: project template --> | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| The easiest way to get started is to clone a [project template](/usage/projects) | ||||
| and run it – for example, this end-to-end template that lets you train a | ||||
| **part-of-speech tagger** and **dependency parser** on a Universal Dependencies | ||||
| treebank. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| #### Training via the Python API {#migrating-training-python} | ||||
| 
 | ||||
|  |  | |||
|  | @ -12,6 +12,7 @@ | |||
|     "companyUrl": "https://explosion.ai", | ||||
|     "repo": "explosion/spaCy", | ||||
|     "modelsRepo": "explosion/spacy-models", | ||||
|     "projectsRepo": "explosion/projects/tree/v3", | ||||
|     "social": { | ||||
|         "twitter": "spacy_io", | ||||
|         "github": "explosion" | ||||
|  |  | |||
|  | @ -13,7 +13,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) { | |||
|         const isValid = isString(children) && !isNaN(children) | ||||
|         const version = isValid ? Number(children).toFixed(1) : children | ||||
|         const tooltipText = `This feature is new and was introduced in spaCy v${version}` | ||||
|         // TODO: we probably want to handle this more elegantly, but the idea is | ||||
|         // We probably want to handle this more elegantly, but the idea is | ||||
|         // that we can hide tags referring to old versions | ||||
|         const major = isString(version) ? Number(version.split('.')[0]) : version | ||||
|         return major < MIN_VERSION ? null : ( | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ const htmlToReactParser = new HtmlToReactParser() | |||
| const DEFAULT_BRANCH = 'develop' | ||||
| export const repo = siteMetadata.repo | ||||
| export const modelsRepo = siteMetadata.modelsRepo | ||||
| export const projectsRepo = siteMetadata.projectsRepo | ||||
| 
 | ||||
| /** | ||||
|  * This is used to provide selectors for headings so they can be crawled by | ||||
|  |  | |||
|  | @ -15,6 +15,10 @@ | |||
|     background: transparent | ||||
|     resize: none | ||||
|     font: inherit | ||||
|     overflow: hidden | ||||
|     white-space: nowrap | ||||
|     text-overflow: ellipsis | ||||
|     margin-right: 1rem | ||||
| 
 | ||||
| .prefix | ||||
|     margin-right: 0.75em | ||||
|  |  | |||
|  | @ -30,7 +30,6 @@ import Benchmarks from 'usage/_benchmarks-models.md' | |||
| 
 | ||||
| const CODE_EXAMPLE = `# pip install spacy | ||||
| # python -m spacy download en_core_web_sm | ||||
| 
 | ||||
| import spacy | ||||
| 
 | ||||
| # Load English tokenizer, tagger, parser and NER | ||||
|  | @ -120,7 +119,7 @@ const Landing = ({ data }) => { | |||
|                         </Li> | ||||
|                         <Li> | ||||
|                             ✅ Components for <strong>named entity</strong> recognition, | ||||
|                             part-of-speech-tagging, dependency parsing, sentence segmentation,{' '} | ||||
|                             part-of-speech tagging, dependency parsing, sentence segmentation,{' '} | ||||
|                             <strong>text classification</strong>, lemmatization, morphological | ||||
|                             analysis, entity linking and more | ||||
|                         </Li> | ||||
|  | @ -223,10 +222,11 @@ const Landing = ({ data }) => { | |||
|                     <br /> | ||||
|                     <br /> | ||||
|                     <br /> | ||||
|                     {/** TODO: update with actual example */} | ||||
|                     <Project id="some_example"> | ||||
|                         Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum | ||||
|                         sodales lectus. | ||||
|                     <Project id="pipelines/tagger_parser_ud" title="Get started"> | ||||
|                         The easiest way to get started is to clone a project template and run it | ||||
|                         – for example, this template for training a{' '} | ||||
|                         <strong>part-of-speech tagger</strong> and{' '} | ||||
|                         <strong>dependency parser</strong> on a Universal Dependencies treebank. | ||||
|                     </Project> | ||||
|                 </LandingCol> | ||||
|                 <LandingCol> | ||||
|  |  | |||
|  | @ -4,25 +4,29 @@ import CopyInput from '../components/copy' | |||
| import Infobox from '../components/infobox' | ||||
| import Link from '../components/link' | ||||
| import { InlineCode } from '../components/code' | ||||
| import { projectsRepo } from '../components/util' | ||||
| 
 | ||||
| // TODO: move to meta? | ||||
| const DEFAULT_REPO = 'https://github.com/explosion/projects/tree/v3' | ||||
| const COMMAND = 'python -m spacy project clone' | ||||
| 
 | ||||
| export default function Project({ id, repo, children }) { | ||||
| export default function Project({ | ||||
|     title = 'Get started with a project template', | ||||
|     id, | ||||
|     repo, | ||||
|     children, | ||||
| }) { | ||||
|     const repoArg = repo ? ` --repo ${repo}` : '' | ||||
|     const text = `${COMMAND} ${id}${repoArg}` | ||||
|     const url = `${repo || DEFAULT_REPO}/${id}` | ||||
|     const title = ( | ||||
|     const url = `${repo || projectsRepo}/${id}` | ||||
|     const header = ( | ||||
|         <> | ||||
|             Get started with a project template:{' '} | ||||
|             {title}:{' '} | ||||
|             <Link to={url}> | ||||
|                 <InlineCode>{id}</InlineCode> | ||||
|             </Link> | ||||
|         </> | ||||
|     ) | ||||
|     return ( | ||||
|         <Infobox title={title} emoji="🪐"> | ||||
|         <Infobox title={header} emoji="🪐"> | ||||
|             {children} | ||||
|             <CopyInput text={text} prefix="$" /> | ||||
|         </Infobox> | ||||
|  |  | |||