mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Merge branch 'feature/project-cli' into develop
This commit is contained in:
		
						commit
						bac8a8d766
					
				|  | @ -15,6 +15,7 @@ from .evaluate import evaluate  # noqa: F401 | ||||||
| from .convert import convert  # noqa: F401 | from .convert import convert  # noqa: F401 | ||||||
| from .init_model import init_model  # noqa: F401 | from .init_model import init_model  # noqa: F401 | ||||||
| from .validate import validate  # noqa: F401 | from .validate import validate  # noqa: F401 | ||||||
|  | from .project import project_clone, project_assets, project_run  # noqa: F401 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) | @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| from typing import Optional |  | ||||||
| import typer | import typer | ||||||
| from typer.main import get_command | from typer.main import get_command | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,7 +1,9 @@ | ||||||
| from typing import Optional, List | from typing import Optional, List, Dict | ||||||
| from timeit import default_timer as timer | from timeit import default_timer as timer | ||||||
| from wasabi import Printer | from wasabi import Printer | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  | import re | ||||||
|  | import srsly | ||||||
| 
 | 
 | ||||||
| from ..gold import Corpus | from ..gold import Corpus | ||||||
| from ..tokens import Doc | from ..tokens import Doc | ||||||
|  | @ -16,13 +18,12 @@ def evaluate_cli( | ||||||
|     # fmt: off |     # fmt: off | ||||||
|     model: str = Arg(..., help="Model name or path"), |     model: str = Arg(..., help="Model name or path"), | ||||||
|     data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True), |     data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True), | ||||||
|  |     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), | ||||||
|     gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), |     gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), | ||||||
|     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), |     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), | ||||||
|     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), |     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), | ||||||
|     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), |     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), | ||||||
|     return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"), |     # fmt: on | ||||||
| 
 |  | ||||||
|         # fmt: on |  | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|     Evaluate a model. To render a sample of parses in a HTML file, set an |     Evaluate a model. To render a sample of parses in a HTML file, set an | ||||||
|  | @ -31,24 +32,24 @@ def evaluate_cli( | ||||||
|     evaluate( |     evaluate( | ||||||
|         model, |         model, | ||||||
|         data_path, |         data_path, | ||||||
|  |         output=output, | ||||||
|         gpu_id=gpu_id, |         gpu_id=gpu_id, | ||||||
|         gold_preproc=gold_preproc, |         gold_preproc=gold_preproc, | ||||||
|         displacy_path=displacy_path, |         displacy_path=displacy_path, | ||||||
|         displacy_limit=displacy_limit, |         displacy_limit=displacy_limit, | ||||||
|         silent=False, |         silent=False, | ||||||
|         return_scores=return_scores, |  | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def evaluate( | def evaluate( | ||||||
|     model: str, |     model: str, | ||||||
|     data_path: Path, |     data_path: Path, | ||||||
|  |     output: Optional[Path], | ||||||
|     gpu_id: int = -1, |     gpu_id: int = -1, | ||||||
|     gold_preproc: bool = False, |     gold_preproc: bool = False, | ||||||
|     displacy_path: Optional[Path] = None, |     displacy_path: Optional[Path] = None, | ||||||
|     displacy_limit: int = 25, |     displacy_limit: int = 25, | ||||||
|     silent: bool = True, |     silent: bool = True, | ||||||
|     return_scores: bool = False, |  | ||||||
| ) -> Scorer: | ) -> Scorer: | ||||||
|     msg = Printer(no_print=silent, pretty=not silent) |     msg = Printer(no_print=silent, pretty=not silent) | ||||||
|     util.fix_random_seed() |     util.fix_random_seed() | ||||||
|  | @ -56,21 +57,19 @@ def evaluate( | ||||||
|         util.use_gpu(gpu_id) |         util.use_gpu(gpu_id) | ||||||
|     util.set_env_log(False) |     util.set_env_log(False) | ||||||
|     data_path = util.ensure_path(data_path) |     data_path = util.ensure_path(data_path) | ||||||
|  |     output_path = util.ensure_path(output) | ||||||
|     displacy_path = util.ensure_path(displacy_path) |     displacy_path = util.ensure_path(displacy_path) | ||||||
|     if not data_path.exists(): |     if not data_path.exists(): | ||||||
|         msg.fail("Evaluation data not found", data_path, exits=1) |         msg.fail("Evaluation data not found", data_path, exits=1) | ||||||
|     if displacy_path and not displacy_path.exists(): |     if displacy_path and not displacy_path.exists(): | ||||||
|         msg.fail("Visualization output directory not found", displacy_path, exits=1) |         msg.fail("Visualization output directory not found", displacy_path, exits=1) | ||||||
|     corpus = Corpus(data_path, data_path) |     corpus = Corpus(data_path, data_path) | ||||||
|     if model.startswith("blank:"): |     nlp = util.load_model(model) | ||||||
|         nlp = util.get_lang_class(model.replace("blank:", ""))() |  | ||||||
|     else: |  | ||||||
|         nlp = util.load_model(model) |  | ||||||
|     dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) |     dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) | ||||||
|     begin = timer() |     begin = timer() | ||||||
|     scorer = nlp.evaluate(dev_dataset, verbose=False) |     scorer = nlp.evaluate(dev_dataset, verbose=False) | ||||||
|     end = timer() |     end = timer() | ||||||
|     nwords = sum(len(ex.doc) for ex in dev_dataset) |     nwords = sum(len(ex.predicted) for ex in dev_dataset) | ||||||
|     results = { |     results = { | ||||||
|         "Time": f"{end - begin:.2f} s", |         "Time": f"{end - begin:.2f} s", | ||||||
|         "Words": nwords, |         "Words": nwords, | ||||||
|  | @ -90,10 +89,22 @@ def evaluate( | ||||||
|         "Sent R": f"{scorer.sent_r:.2f}", |         "Sent R": f"{scorer.sent_r:.2f}", | ||||||
|         "Sent F": f"{scorer.sent_f:.2f}", |         "Sent F": f"{scorer.sent_f:.2f}", | ||||||
|     } |     } | ||||||
|  |     data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} | ||||||
|  | 
 | ||||||
|     msg.table(results, title="Results") |     msg.table(results, title="Results") | ||||||
| 
 | 
 | ||||||
|  |     if scorer.ents_per_type: | ||||||
|  |         data["ents_per_type"] = scorer.ents_per_type | ||||||
|  |         print_ents_per_type(msg, scorer.ents_per_type) | ||||||
|  |     if scorer.textcats_f_per_cat: | ||||||
|  |         data["textcats_f_per_cat"] = scorer.textcats_f_per_cat | ||||||
|  |         print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat) | ||||||
|  |     if scorer.textcats_auc_per_cat: | ||||||
|  |         data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat | ||||||
|  |         print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat) | ||||||
|  | 
 | ||||||
|     if displacy_path: |     if displacy_path: | ||||||
|         docs = [ex.doc for ex in dev_dataset] |         docs = [ex.predicted for ex in dev_dataset] | ||||||
|         render_deps = "parser" in nlp.meta.get("pipeline", []) |         render_deps = "parser" in nlp.meta.get("pipeline", []) | ||||||
|         render_ents = "ner" in nlp.meta.get("pipeline", []) |         render_ents = "ner" in nlp.meta.get("pipeline", []) | ||||||
|         render_parses( |         render_parses( | ||||||
|  | @ -105,8 +116,11 @@ def evaluate( | ||||||
|             ents=render_ents, |             ents=render_ents, | ||||||
|         ) |         ) | ||||||
|         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) |         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) | ||||||
|     if return_scores: | 
 | ||||||
|         return scorer.scores |     if output_path is not None: | ||||||
|  |         srsly.write_json(output_path, data) | ||||||
|  |         msg.good(f"Saved results to {output_path}") | ||||||
|  |     return data | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def render_parses( | def render_parses( | ||||||
|  | @ -128,3 +142,40 @@ def render_parses( | ||||||
|         ) |         ) | ||||||
|         with (output_path / "parses.html").open("w", encoding="utf8") as file_: |         with (output_path / "parses.html").open("w", encoding="utf8") as file_: | ||||||
|             file_.write(html) |             file_.write(html) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: | ||||||
|  |     data = [ | ||||||
|  |         (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") | ||||||
|  |         for k, v in scores.items() | ||||||
|  |     ] | ||||||
|  |     msg.table( | ||||||
|  |         data, | ||||||
|  |         header=("", "P", "R", "F"), | ||||||
|  |         aligns=("l", "r", "r", "r"), | ||||||
|  |         title="NER (per type)", | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: | ||||||
|  |     data = [ | ||||||
|  |         (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") | ||||||
|  |         for k, v in scores.items() | ||||||
|  |     ] | ||||||
|  |     msg.table( | ||||||
|  |         data, | ||||||
|  |         header=("", "P", "R", "F"), | ||||||
|  |         aligns=("l", "r", "r", "r"), | ||||||
|  |         title="Textcat F (per type)", | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def print_textcats_auc_per_cat( | ||||||
|  |     msg: Printer, scores: Dict[str, Dict[str, float]] | ||||||
|  | ) -> None: | ||||||
|  |     msg.table( | ||||||
|  |         [(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()], | ||||||
|  |         header=("", "ROC AUC"), | ||||||
|  |         aligns=("l", "r"), | ||||||
|  |         title="Textcat ROC AUC (per label)", | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  | @ -16,8 +16,9 @@ def package_cli( | ||||||
|     # fmt: off |     # fmt: off | ||||||
|     input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False), |     input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False), | ||||||
|     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), |     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), | ||||||
|     meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False), |     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), | ||||||
|     create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), |     create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), | ||||||
|  |     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), | ||||||
|     force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"), |     force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"), | ||||||
|     # fmt: on |     # fmt: on | ||||||
| ): | ): | ||||||
|  | @ -32,6 +33,7 @@ def package_cli( | ||||||
|         input_dir, |         input_dir, | ||||||
|         output_dir, |         output_dir, | ||||||
|         meta_path=meta_path, |         meta_path=meta_path, | ||||||
|  |         version=version, | ||||||
|         create_meta=create_meta, |         create_meta=create_meta, | ||||||
|         force=force, |         force=force, | ||||||
|         silent=False, |         silent=False, | ||||||
|  | @ -42,6 +44,7 @@ def package( | ||||||
|     input_dir: Path, |     input_dir: Path, | ||||||
|     output_dir: Path, |     output_dir: Path, | ||||||
|     meta_path: Optional[Path] = None, |     meta_path: Optional[Path] = None, | ||||||
|  |     version: Optional[str] = None, | ||||||
|     create_meta: bool = False, |     create_meta: bool = False, | ||||||
|     force: bool = False, |     force: bool = False, | ||||||
|     silent: bool = True, |     silent: bool = True, | ||||||
|  | @ -61,10 +64,13 @@ def package( | ||||||
|     if not meta_path.exists() or not meta_path.is_file(): |     if not meta_path.exists() or not meta_path.is_file(): | ||||||
|         msg.fail("Can't load model meta.json", meta_path, exits=1) |         msg.fail("Can't load model meta.json", meta_path, exits=1) | ||||||
|     meta = srsly.read_json(meta_path) |     meta = srsly.read_json(meta_path) | ||||||
|  |     meta = get_meta(input_dir, meta) | ||||||
|  |     if version is not None: | ||||||
|  |         meta["version"] = version | ||||||
|     if not create_meta:  # only print if user doesn't want to overwrite |     if not create_meta:  # only print if user doesn't want to overwrite | ||||||
|         msg.good("Loaded meta.json from file", meta_path) |         msg.good("Loaded meta.json from file", meta_path) | ||||||
|     else: |     else: | ||||||
|         meta = generate_meta(input_dir, meta, msg) |         meta = generate_meta(meta, msg) | ||||||
|     errors = validate(ModelMetaSchema, meta) |     errors = validate(ModelMetaSchema, meta) | ||||||
|     if errors: |     if errors: | ||||||
|         msg.fail("Invalid model meta.json", "\n".join(errors), exits=1) |         msg.fail("Invalid model meta.json", "\n".join(errors), exits=1) | ||||||
|  | @ -101,20 +107,20 @@ def create_file(file_path: Path, contents: str) -> None: | ||||||
|     file_path.open("w", encoding="utf-8").write(contents) |     file_path.open("w", encoding="utf-8").write(contents) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def generate_meta( | def get_meta( | ||||||
|     model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer |     model_path: Union[str, Path], existing_meta: Dict[str, Any] | ||||||
| ) -> Dict[str, Any]: | ) -> Dict[str, Any]: | ||||||
|     meta = existing_meta or {} |     meta = { | ||||||
|     settings = [ |         "lang": "en", | ||||||
|         ("lang", "Model language", meta.get("lang", "en")), |         "name": "model", | ||||||
|         ("name", "Model name", meta.get("name", "model")), |         "version": "0.0.0", | ||||||
|         ("version", "Model version", meta.get("version", "0.0.0")), |         "description": None, | ||||||
|         ("description", "Model description", meta.get("description", False)), |         "author": None, | ||||||
|         ("author", "Author", meta.get("author", False)), |         "email": None, | ||||||
|         ("email", "Author email", meta.get("email", False)), |         "url": None, | ||||||
|         ("url", "Author website", meta.get("url", False)), |         "license": "MIT", | ||||||
|         ("license", "License", meta.get("license", "MIT")), |     } | ||||||
|     ] |     meta.update(existing_meta) | ||||||
|     nlp = util.load_model_from_path(Path(model_path)) |     nlp = util.load_model_from_path(Path(model_path)) | ||||||
|     meta["spacy_version"] = util.get_model_version_range(about.__version__) |     meta["spacy_version"] = util.get_model_version_range(about.__version__) | ||||||
|     meta["pipeline"] = nlp.pipe_names |     meta["pipeline"] = nlp.pipe_names | ||||||
|  | @ -124,6 +130,23 @@ def generate_meta( | ||||||
|         "keys": nlp.vocab.vectors.n_keys, |         "keys": nlp.vocab.vectors.n_keys, | ||||||
|         "name": nlp.vocab.vectors.name, |         "name": nlp.vocab.vectors.name, | ||||||
|     } |     } | ||||||
|  |     if about.__title__ != "spacy": | ||||||
|  |         meta["parent_package"] = about.__title__ | ||||||
|  |     return meta | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]: | ||||||
|  |     meta = existing_meta or {} | ||||||
|  |     settings = [ | ||||||
|  |         ("lang", "Model language", meta.get("lang", "en")), | ||||||
|  |         ("name", "Model name", meta.get("name", "model")), | ||||||
|  |         ("version", "Model version", meta.get("version", "0.0.0")), | ||||||
|  |         ("description", "Model description", meta.get("description", None)), | ||||||
|  |         ("author", "Author", meta.get("author", None)), | ||||||
|  |         ("email", "Author email", meta.get("email", None)), | ||||||
|  |         ("url", "Author website", meta.get("url", None)), | ||||||
|  |         ("license", "License", meta.get("license", "MIT")), | ||||||
|  |     ] | ||||||
|     msg.divider("Generating meta.json") |     msg.divider("Generating meta.json") | ||||||
|     msg.text( |     msg.text( | ||||||
|         "Enter the package settings for your model. The following information " |         "Enter the package settings for your model. The following information " | ||||||
|  | @ -132,8 +155,6 @@ def generate_meta( | ||||||
|     for setting, desc, default in settings: |     for setting, desc, default in settings: | ||||||
|         response = get_raw_input(desc, default) |         response = get_raw_input(desc, default) | ||||||
|         meta[setting] = default if response == "" and default else response |         meta[setting] = default if response == "" and default else response | ||||||
|     if about.__title__ != "spacy": |  | ||||||
|         meta["parent_package"] = about.__title__ |  | ||||||
|     return meta |     return meta | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -184,12 +205,12 @@ def setup_package(): | ||||||
| 
 | 
 | ||||||
|     setup( |     setup( | ||||||
|         name=model_name, |         name=model_name, | ||||||
|         description=meta['description'], |         description=meta.get('description'), | ||||||
|         author=meta['author'], |         author=meta.get('author'), | ||||||
|         author_email=meta['email'], |         author_email=meta.get('email'), | ||||||
|         url=meta['url'], |         url=meta.get('url'), | ||||||
|         version=meta['version'], |         version=meta['version'], | ||||||
|         license=meta['license'], |         license=meta.get('license'), | ||||||
|         packages=[model_name], |         packages=[model_name], | ||||||
|         package_data={model_name: list_files(model_dir)}, |         package_data={model_name: list_files(model_dir)}, | ||||||
|         install_requires=list_requirements(meta), |         install_requires=list_requirements(meta), | ||||||
|  |  | ||||||
							
								
								
									
										657
									
								
								spacy/cli/project.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										657
									
								
								spacy/cli/project.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,657 @@ | ||||||
|  | from typing import List, Dict, Any, Optional | ||||||
|  | import typer | ||||||
|  | import srsly | ||||||
|  | from pathlib import Path | ||||||
|  | from wasabi import msg | ||||||
|  | import subprocess | ||||||
|  | import shlex | ||||||
|  | import os | ||||||
|  | import re | ||||||
|  | import shutil | ||||||
|  | import sys | ||||||
|  | import requests | ||||||
|  | import tqdm | ||||||
|  | 
 | ||||||
|  | from ._app import app, Arg, Opt, COMMAND, NAME | ||||||
|  | from .. import about | ||||||
|  | from ..schemas import ProjectConfigSchema, validate | ||||||
|  | from ..util import ensure_path, run_command, make_tempdir, working_dir | ||||||
|  | from ..util import get_hash, get_checksum | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | CONFIG_FILE = "project.yml" | ||||||
|  | DVC_CONFIG = "dvc.yaml" | ||||||
|  | DIRS = [ | ||||||
|  |     "assets", | ||||||
|  |     "metas", | ||||||
|  |     "configs", | ||||||
|  |     "packages", | ||||||
|  |     "metrics", | ||||||
|  |     "scripts", | ||||||
|  |     "notebooks", | ||||||
|  |     "training", | ||||||
|  |     "corpus", | ||||||
|  | ] | ||||||
|  | CACHES = [ | ||||||
|  |     Path.home() / ".torch", | ||||||
|  |     Path.home() / ".caches" / "torch", | ||||||
|  |     os.environ.get("TORCH_HOME"), | ||||||
|  |     Path.home() / ".keras", | ||||||
|  | ] | ||||||
|  | DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit | ||||||
|  | # it directly and edit the project.yml instead and re-run the project.""" | ||||||
|  | CLI_HELP = f"""Command-line interface for spaCy projects and working with project | ||||||
|  | templates. You'd typically start by cloning a project template to a local | ||||||
|  | directory and fetching its assets like datasets etc. See the project's | ||||||
|  | {CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data | ||||||
|  | Version Control) to manage input and output files and to ensure steps are only | ||||||
|  | re-run if their inputs change. | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | project_cli = typer.Typer(help=CLI_HELP) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @project_cli.callback(invoke_without_command=True) | ||||||
|  | def callback(ctx: typer.Context): | ||||||
|  |     """This runs before every project command and ensures DVC is installed.""" | ||||||
|  |     ensure_dvc() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ################ | ||||||
|  | # CLI COMMANDS # | ||||||
|  | ################ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @project_cli.command("clone") | ||||||
|  | def project_clone_cli( | ||||||
|  |     # fmt: off | ||||||
|  |     name: str = Arg(..., help="The name of the template to fetch"), | ||||||
|  |     dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), | ||||||
|  |     repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), | ||||||
|  |     git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), | ||||||
|  |     no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"), | ||||||
|  |     # fmt: on | ||||||
|  | ): | ||||||
|  |     """Clone a project template from a repository. Calls into "git" and will | ||||||
|  |     only download the files from the given subdirectory. The GitHub repo | ||||||
|  |     defaults to the official spaCy template repo, but can be customized | ||||||
|  |     (including using a private repo). Setting the --git flag will also | ||||||
|  |     initialize the project directory as a Git repo. If the project is intended | ||||||
|  |     to be a Git repo, it should be initialized with Git first, before | ||||||
|  |     initializing DVC (Data Version Control). This allows DVC to integrate with | ||||||
|  |     Git. | ||||||
|  |     """ | ||||||
|  |     project_clone(name, dest, repo=repo, git=git, no_init=no_init) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @project_cli.command("init") | ||||||
|  | def project_init_cli( | ||||||
|  |     path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), | ||||||
|  |     git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), | ||||||
|  | ): | ||||||
|  |     """Initialize a project directory with DVC and optionally Git. This should | ||||||
|  |     typically be taken care of automatically when you run the "project clone" | ||||||
|  |     command, but you can also run it separately. If the project is intended to | ||||||
|  |     be a Git repo, it should be initialized with Git first, before initializing | ||||||
|  |     DVC. This allows DVC to integrate with Git. | ||||||
|  |     """ | ||||||
|  |     project_init(path, git=git, silent=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @project_cli.command("assets") | ||||||
|  | def project_assets_cli( | ||||||
|  |     # fmt: off | ||||||
|  |     project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), | ||||||
|  |     # fmt: on | ||||||
|  | ): | ||||||
|  |     """Use DVC (Data Version Control) to fetch project assets. Assets are | ||||||
|  |     defined in the "assets" section of the project config. If possible, DVC | ||||||
|  |     will try to track the files so you can pull changes from upstream. It will | ||||||
|  |     also try and store the checksum so the assets are versioned. If the file | ||||||
|  |     can't be tracked or checked, it will be downloaded without DVC. If a checksum | ||||||
|  |     is provided in the project config, the file is only downloaded if no local | ||||||
|  |     file with the same checksum exists. | ||||||
|  |     """ | ||||||
|  |     project_assets(project_dir) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @project_cli.command( | ||||||
|  |     "run-all", | ||||||
|  |     context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, | ||||||
|  | ) | ||||||
|  | def project_run_all_cli( | ||||||
|  |     # fmt: off | ||||||
|  |     ctx: typer.Context, | ||||||
|  |     project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), | ||||||
|  |     show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") | ||||||
|  |     # fmt: on | ||||||
|  | ): | ||||||
|  |     """Run all commands defined in the project. This command will use DVC and | ||||||
|  |     the defined outputs and dependencies in the project config to determine | ||||||
|  |     which steps need to be re-run and where to start. This means you're only | ||||||
|  |     re-generating data if the inputs have changed. | ||||||
|  | 
 | ||||||
|  |     This command calls into "dvc repro" and all additional arguments are passed | ||||||
|  |     to the "dvc repro" command: https://dvc.org/doc/command-reference/repro | ||||||
|  |     """ | ||||||
|  |     if show_help: | ||||||
|  |         print_run_help(project_dir) | ||||||
|  |     else: | ||||||
|  |         project_run_all(project_dir, *ctx.args) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @project_cli.command( | ||||||
|  |     "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, | ||||||
|  | ) | ||||||
|  | def project_run_cli( | ||||||
|  |     # fmt: off | ||||||
|  |     ctx: typer.Context, | ||||||
|  |     project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), | ||||||
|  |     subcommand: str = Arg(None, help="Name of command defined in project config"), | ||||||
|  |     show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") | ||||||
|  |     # fmt: on | ||||||
|  | ): | ||||||
|  |     """Run a named script defined in the project config. If the command is | ||||||
|  |     part of the default pipeline defined in the "run" section, DVC is used to | ||||||
|  |     determine whether the step should re-run if its inputs have changed, or | ||||||
|  |     whether everything is up to date. If the script is not part of the default | ||||||
|  |     pipeline, it will be called separately without DVC. | ||||||
|  | 
 | ||||||
|  |     If DVC is used, the command calls into "dvc repro" and all additional | ||||||
|  |     arguments are passed to the "dvc repro" command: | ||||||
|  |     https://dvc.org/doc/command-reference/repro | ||||||
|  |     """ | ||||||
|  |     if show_help or not subcommand: | ||||||
|  |         print_run_help(project_dir, subcommand) | ||||||
|  |     else: | ||||||
|  |         project_run(project_dir, subcommand, *ctx.args) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @project_cli.command("exec", hidden=True) | ||||||
|  | def project_exec_cli( | ||||||
|  |     # fmt: off | ||||||
|  |     project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), | ||||||
|  |     subcommand: str = Arg(..., help="Name of command defined in project config"), | ||||||
|  |     # fmt: on | ||||||
|  | ): | ||||||
|  |     """Execute a command defined in the project config. This CLI command is | ||||||
|  |     only called internally in auto-generated DVC pipelines, as a shortcut for | ||||||
|  |     multi-step commands in the project config. You typically shouldn't have to | ||||||
|  |     call it yourself. To run a command, call "run" or "run-all". | ||||||
|  |     """ | ||||||
|  |     project_exec(project_dir, subcommand) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @project_cli.command("update-dvc") | ||||||
|  | def project_update_dvc_cli( | ||||||
|  |     # fmt: off | ||||||
|  |     project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), | ||||||
|  |     verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), | ||||||
|  |     force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), | ||||||
|  |     # fmt: on | ||||||
|  | ): | ||||||
|  |     """Update the auto-generated DVC config file. Uses the steps defined in the | ||||||
|  |     "run" section of the project config. This typically happens automatically | ||||||
|  |     when running a command, but can also be triggered manually if needed. | ||||||
|  |     """ | ||||||
|  |     config = load_project_config(project_dir) | ||||||
|  |     updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) | ||||||
|  |     if updated: | ||||||
|  |         msg.good(f"Updated DVC config from {CONFIG_FILE}") | ||||||
|  |     else: | ||||||
|  |         msg.info(f"No changes found in {CONFIG_FILE}, no update needed") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | app.add_typer(project_cli, name="project") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ################# | ||||||
|  | # CLI FUNCTIONS # | ||||||
|  | ################# | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    git: bool = False,
    no_init: bool = False,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone. May be a nested path within
        the repo, e.g. "examples/my_project".
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    git (bool): Initialize project as Git repo. Should be set to True if project
        is intended as a repo, since it will allow DVC to integrate with Git.
    no_init (bool): Don't initialize DVC and Git automatically. If True, the
        "init" command or "git init" and "dvc init" need to be run manually.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        # Build the argument lists directly instead of formatting a string and
        # splitting it again, so repo URLs and temp paths containing spaces or
        # backslashes are passed to Git unchanged.
        clone_cmd = [
            "git",
            "clone",
            repo,
            str(tmp_dir),
            "--no-checkout",
            "--depth",
            "1",
            "--config",
            "core.sparseCheckout=true",
        ]
        run_command(clone_cmd)
        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
            f.write(name)
        run_command(["git", "-C", str(tmp_dir), "fetch"])
        run_command(["git", "-C", str(tmp_dir), "checkout"])
        # Use the full relative path so templates in nested subdirectories
        # are found where the sparse checkout actually placed them
        shutil.move(str(tmp_dir / Path(name)), str(project_dir))
    msg.good(f"Cloned project '{name}' from {repo}")
    for sub_dir in DIRS:
        dir_path = project_dir / sub_dir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    if not no_init:
        project_init(project_dir, git=git, silent=True)
    msg.good("Your project is now ready!", dest)
    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|  | 
 | ||||||
|  | 
 | ||||||
def project_init(
    project_dir: Path,
    *,
    git: bool = False,
    silent: bool = False,
    analytics: bool = False,
):
    """Initialize a project as a DVC and (optionally) as a Git repo.

    project_dir (Path): Path to project directory.
    git (bool): Also call "git init" to initialize directory as a Git repo.
    silent (bool): Don't print any output (via DVC).
    analytics (bool): Opt-in to DVC analytics (defaults to False).
    """
    init_cmd = ["dvc", "init"]
    if silent:
        init_cmd.append("--quiet")
    if not git:
        # Without a Git repo, DVC has to be initialized without SCM support
        init_cmd.append("--no-scm")
    with working_dir(project_dir):
        if git:
            run_command(["git", "init"])
        run_command(init_cmd)
        # We don't want to have analytics on by default – our users should
        # opt-in explicitly. If they want it, they can always enable it.
        if not analytics:
            run_command(["dvc", "config", "core.analytics", "false"])
    # Load and check the config only after leaving the working directory
    # context: a relative project_dir would otherwise be resolved against the
    # project directory itself and point to a non-existent nested path.
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
|  | 
 | ||||||
|  | 
 | ||||||
def project_assets(project_dir: Path) -> None:
    """Download all assets listed in the project config, via DVC if possible.

    project_dir (Path): Path to project directory.
    """
    root = ensure_path(project_dir)
    project_config = load_project_config(root)
    setup_check_dvc(root, project_config)
    asset_specs = project_config.get("assets", {})
    if not asset_specs:
        msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
    msg.info(f"Fetching {len(asset_specs)} asset(s)")
    # Project variables can be referenced in both the URL and the destination
    substitutions = project_config.get("variables", {})
    for spec in asset_specs:
        fetch_asset(
            root,
            spec["url"].format(**substitutions),
            spec["dest"].format(**substitutions),
            spec.get("checksum"),
        )
|  | 
 | ||||||
|  | 
 | ||||||
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Fetch an asset from a given URL or path. Will try to import the file
    using DVC's import-url if possible (fully tracked and versioned) and falls
    back to get-url (versioned) and a non-DVC download if necessary. If a
    checksum is provided and a local file exists, it's only re-downloaded if the
    checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, relative to the project directory.
    checksum (Optional[str]): Optional expected checksum of local file.
    """
    url = convert_asset_url(url)
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        # TODO: add support for caches (dvc import-url with local path)
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return
    dvc_add_cmd = ["dvc", "add", str(dest_path), "--external"]
    with working_dir(project_path):
        try:
            # If these fail, we don't want to output an error or info message.
            # Try with tracking the source first, then just downloading with
            # DVC, then a regular non-DVC download.
            try:
                dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
                # Decode the output as text so it isn't printed as a bytes repr
                print(
                    subprocess.check_output(
                        dvc_cmd, stderr=subprocess.DEVNULL, universal_newlines=True
                    )
                )
            except subprocess.CalledProcessError:
                dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
                print(
                    subprocess.check_output(
                        dvc_cmd, stderr=subprocess.DEVNULL, universal_newlines=True
                    )
                )
                run_command(dvc_add_cmd)
        except subprocess.CalledProcessError:
            try:
                download_file(url, dest_path)
            except requests.exceptions.HTTPError as e:
                msg.fail(f"Download failed: {dest}", e)
                # Nothing was downloaded, so there's nothing to add to DVC and
                # no checksum to compare. Don't report the asset as fetched.
                return
            run_command(dvc_add_cmd)
    if checksum and checksum != get_checksum(dest_path):
        msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
    msg.good(f"Fetched asset {dest}")
|  | 
 | ||||||
|  | 
 | ||||||
def project_run_all(project_dir: Path, *dvc_args) -> None:
    """Reproduce the whole project pipeline with "dvc repro".

    project_dir (Path): Path to project directory.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    project_config = load_project_config(project_dir)
    setup_check_dvc(project_dir, project_config)
    with working_dir(project_dir):
        run_command(["dvc", "repro", *dvc_args])
|  | 
 | ||||||
|  | 
 | ||||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Print a CLI-style help message based on the project config.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, its usage and help text are shown. Otherwise, the top-level
        usage and a table of all available commands is printed.
    """
    project_config = load_project_config(project_dir)
    setup_check_dvc(project_dir, project_config)
    command_list = project_config.get("commands", [])
    by_name = {entry["name"]: entry for entry in command_list}
    if not subcommand:
        # Top-level help: list every command with its description
        print(f"\nAvailable commands in {CONFIG_FILE}")
        print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]")
        rows = [(entry["name"], entry.get("help", "")) for entry in command_list]
        msg.table(rows)
        msg.text("Run all commands defined in the 'run' block of the project config:")
        print(f"{COMMAND} project run-all {project_dir}")
        return
    if subcommand not in by_name:
        msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
    print(f"Usage: {COMMAND} project run {project_dir} {subcommand}")
    description = by_name[subcommand].get("help")
    if description:
        msg.text(f"\n{description}\n")
|  | 
 | ||||||
|  | 
 | ||||||
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
    """Run a named command from the project config. Commands listed in the
    "run" section are part of the DVC pipeline and are executed via
    "dvc repro", so DVC can decide whether they need to be rerun. All other
    commands have their script executed directly.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    project_config = load_project_config(project_dir)
    setup_check_dvc(project_dir, project_config)
    by_name = {entry["name"]: entry for entry in project_config.get("commands", [])}
    substitutions = project_config.get("variables", {})
    if subcommand not in by_name:
        msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
    if subcommand in project_config.get("run", []):
        # This is one of the pipeline commands tracked in DVC
        with working_dir(project_dir):
            run_command(["dvc", "repro", subcommand, *dvc_args])
        return
    entry = by_name[subcommand]
    # Deps in non-DVC commands aren't tracked, but if they're defined,
    # make sure they exist before running the command
    for dep in entry.get("deps", []):
        if (project_dir / dep).exists():
            continue
        err = f"Missing dependency specified by command '{subcommand}': {dep}"
        msg.fail(err, exits=1)
    with working_dir(project_dir):
        run_commands(entry["script"], substitutions)
|  | 
 | ||||||
|  | 
 | ||||||
def project_exec(project_dir: Path, subcommand: str):
    """Execute a command defined in the project config.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    # Report unknown commands the same way as "project run" instead of
    # failing with a bare KeyError
    if subcommand not in commands:
        msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
    with working_dir(project_dir):
        run_commands(commands[subcommand]["script"], variables)
|  | 
 | ||||||
|  | 
 | ||||||
|  | ########### | ||||||
|  | # HELPERS # | ||||||
|  | ########### | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def load_project_config(path: Path) -> Dict[str, Any]:
    """Read the project config file in a project directory and validate it
    against the project config schema. Exits with an error if the file is
    missing or invalid.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project config.
    """
    config_file = path / CONFIG_FILE
    if not config_file.exists():
        msg.fail("Can't find project config", config_file, exits=1)
    loaded = srsly.read_yaml(config_file)
    schema_errors = validate(ProjectConfigSchema, loaded)
    if schema_errors:
        details = "\n".join(schema_errors)
        msg.fail(f"Invalid project config in {CONFIG_FILE}", details, exits=1)
    return loaded
|  | 
 | ||||||
|  | 
 | ||||||
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated. False if the
        existing file is up to date, or if no DVC config file was generated
        (e.g. no "run" command defines deps or outputs).
    """
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project config, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    commands = []
    # We only want to include commands that are part of the main list of "run"
    # commands in project.yml and should be run in sequence
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    for name in config.get("run", []):
        if name not in config_commands:
            msg.fail(f"Can't find command '{name}' in project config", exits=1)
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to "." as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "exec", ".", name]
        deps_cmd = [arg for p in deps for arg in ("-d", p)]
        outputs_cmd = [arg for p in outputs for arg in ("-o", p)]
        outputs_nc_cmd = [arg for p in outputs_no_cache for arg in ("-O", p)]
        # The joined command is split again with shlex before it's executed,
        # so the working directory needs to be quoted to survive spaces and
        # backslashes in the resolved path
        work_dir = shlex.quote(str(path))
        dvc_cmd = ["dvc", "run", "-n", name, "-w", work_dir, "--no-exec"]
        if verbose:
            dvc_cmd.append("--verbose")
        if silent:
            dvc_cmd.append("--quiet")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        commands.append(" ".join(full_cmd))
    with working_dir(path):
        run_commands(commands, variables, silent=True)
    if not dvc_config_path.exists():
        # None of the commands produced a DVC config (e.g. empty "run" section
        # or no deps/outputs defined), so there's no file to add the hash to
        return False
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
|  | 
 | ||||||
|  | 
 | ||||||
def ensure_dvc() -> None:
    """Check that the "dvc" command can be executed and exit with an error
    message if running it fails."""
    version_cmd = ["dvc", "--version"]
    try:
        subprocess.run(version_cmd, stdout=subprocess.DEVNULL)
    except Exception:
        title = "spaCy projects require DVC (Data Version Control) and the 'dvc' command"
        text = (
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install"
        )
        msg.fail(title, text, exits=1)
|  | 
 | ||||||
|  | 
 | ||||||
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
    """Check that the project is set up correctly with DVC and update its
    config if needed. Will exit with an error if the project directory
    doesn't exist or is not an initialized DVC project.

    project_dir (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    """
    if not project_dir.exists():
        # Exit here, otherwise the check below reports a misleading
        # "not initialized" error for a directory that doesn't exist
        msg.fail(f"Can't find project directory: {project_dir}", exits=1)
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project.",
            f"Make sure that the project template was cloned correctly. To "
            f"initialize the project directory manually, you can run: "
            f"{COMMAND} project init {project_dir}",
            exits=1,
        )
    with msg.loading("Updating DVC config..."):
        updated = update_dvc_config(project_dir, config, silent=True)
    if updated:
        msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|  | 
 | ||||||
|  | 
 | ||||||
def run_commands(
    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
    """Execute a list of command strings one after another in a subprocess.

    commands (List[str]): The command strings. Each one is formatted with the
        variables and then split into arguments.
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    silent (bool): Don't print the commands.
    """
    for raw_command in commands:
        # Substitute variables, e.g. "./{NAME}.json", then split into arguments
        args = shlex.split(raw_command.format(**variables))
        executable = args[0] if args else None
        # TODO: is this needed / a good idea?
        # Route "python" and "pip" through the current interpreter
        if executable == "python":
            args[0] = sys.executable
        elif executable == "pip":
            args = [sys.executable, "-m", "pip", *args[1:]]
        if not silent:
            print(" ".join(args))
        run_command(args)
|  | 
 | ||||||
|  | 
 | ||||||
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed. Regular GitHub page URLs are
    rewritten to raw.githubusercontent.com URLs; all other URLs are returned
    unchanged.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake. Match the
    # exact host only (escaped dot, followed by "/" or the end of the URL), so
    # hosts that merely start with "github.com" aren't rewritten.
    if re.match(r"https?://github\.com(/|$)", url):
        # Only replace the host and the first "tree"/"blob" path segment, so
        # later occurrences in the repo or file path are left intact
        converted = url.replace("github.com", "raw.githubusercontent.com", 1)
        converted = re.sub(r"/(tree|blob)/", "/", converted, count=1)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
|  | 
 | ||||||
|  | 
 | ||||||
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Validate the prerequisites for cloning a project template: the "git"
    command has to be runnable, the destination must not exist yet and its
    parent directory has to exist. Exits with an error otherwise.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    git_version_cmd = ["git", "--version"]
    try:
        subprocess.run(git_version_cmd, stdout=subprocess.DEVNULL)
    except Exception:
        manual_steps = (
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run:"
        )
        msg.fail(
            "Cloning spaCy project templates requires Git and the 'git' command. ",
            manual_steps,
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    parent_dir = dest.parent
    if not parent_dir.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {parent_dir}",
            exits=1,
        )
|  | 
 | ||||||
|  | 
 | ||||||
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests and show a progress bar.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    # Use the response as a context manager so the streamed connection is
    # released even if the status check or writing the file fails
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        total = int(response.headers.get("content-length", 0))
        progress_settings = {
            "total": total,
            "unit": "iB",
            "unit_scale": True,
            # Binary unit prefixes are always based on 1024, independent of
            # the chunk size used for reading
            "unit_divisor": 1024,
            "leave": False,
        }
        with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
            for data in response.iter_content(chunk_size=chunk_size):
                size = f.write(data)
                bar.update(size)
|  | @ -220,8 +220,11 @@ class TrainingSchema(BaseModel): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
class ProjectConfigAsset(BaseModel):
    # Schema for a single entry in the "assets" section of a project config
    # fmt: off
    dest: StrictStr = Field(..., title="Destination of downloaded asset")
    url: StrictStr = Field(..., title="URL of asset")
    # The checksum is optional. The pattern is anchored at the end so only
    # values of exactly 32 hex characters are accepted.
    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"^([a-fA-F\d]{32})$")
    # fmt: on
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class ProjectConfigCommand(BaseModel): | class ProjectConfigCommand(BaseModel): | ||||||
|  | @ -229,11 +232,15 @@ class ProjectConfigCommand(BaseModel): | ||||||
|     name: StrictStr = Field(..., title="Name of command") |     name: StrictStr = Field(..., title="Name of command") | ||||||
|     help: Optional[StrictStr] = Field(None, title="Command description") |     help: Optional[StrictStr] = Field(None, title="Command description") | ||||||
|     script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") |     script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") | ||||||
|     dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies") |     deps: List[StrictStr] = Field([], title="Data Version Control dependencies") | ||||||
|     dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs") |     outputs: List[StrictStr] = Field([], title="Data Version Control outputs") | ||||||
|     dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") |     outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") | ||||||
|     # fmt: on |     # fmt: on | ||||||
| 
 | 
 | ||||||
|  |     class Config: | ||||||
|  |         title = "A single named command specified in a project config" | ||||||
|  |         extra = "forbid" | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class ProjectConfigSchema(BaseModel): | class ProjectConfigSchema(BaseModel): | ||||||
|     # fmt: off |     # fmt: off | ||||||
|  |  | ||||||
|  | @ -1,15 +1,14 @@ | ||||||
| import numpy | import numpy | ||||||
| import tempfile | import tempfile | ||||||
| import shutil |  | ||||||
| import contextlib | import contextlib | ||||||
| import srsly | import srsly | ||||||
| from pathlib import Path |  | ||||||
| 
 | 
 | ||||||
| from spacy import Errors | from spacy import Errors | ||||||
| from spacy.tokens import Doc, Span | from spacy.tokens import Doc, Span | ||||||
| from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH | from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH | ||||||
| 
 | 
 | ||||||
| from spacy.vocab import Vocab | from spacy.vocab import Vocab | ||||||
|  | from spacy.util import make_tempdir  # noqa: F401 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @contextlib.contextmanager | @contextlib.contextmanager | ||||||
|  | @ -19,13 +18,6 @@ def make_tempfile(mode="r"): | ||||||
|     f.close() |     f.close() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @contextlib.contextmanager |  | ||||||
| def make_tempdir(): |  | ||||||
|     d = Path(tempfile.mkdtemp()) |  | ||||||
|     yield d |  | ||||||
|     shutil.rmtree(str(d)) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def get_doc( | def get_doc( | ||||||
|     vocab, |     vocab, | ||||||
|     words=[], |     words=[], | ||||||
|  |  | ||||||
|  | @ -19,6 +19,9 @@ from packaging.specifiers import SpecifierSet, InvalidSpecifier | ||||||
| from packaging.version import Version, InvalidVersion | from packaging.version import Version, InvalidVersion | ||||||
| import subprocess | import subprocess | ||||||
| from contextlib import contextmanager | from contextlib import contextmanager | ||||||
|  | import tempfile | ||||||
|  | import shutil | ||||||
|  | import hashlib | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| try: | try: | ||||||
|  | @ -455,6 +458,37 @@ def working_dir(path: Union[str, Path]) -> None: | ||||||
|         os.chdir(prev_cwd) |         os.chdir(prev_cwd) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
@contextmanager
def make_tempdir():
    """Execute a block in a temporary directory and remove the directory and
    its contents at the end of the with block, even if the block raises.

    YIELDS (Path): The path of the temp directory.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Always clean up, otherwise an exception in the with block would
        # leave the temporary directory behind
        shutil.rmtree(str(d))
|  | 
 | ||||||
|  | 
 | ||||||
def get_hash(data) -> str:
    """Compute an MD5 hex digest of a JSON-serializable object, based on its
    JSON representation with sorted keys.

    data: The data to hash.
    RETURNS (str): The hash.
    """
    serialized = srsly.json_dumps(data, sort_keys=True)
    digest = hashlib.md5(serialized.encode("utf8"))
    return digest.hexdigest()
|  | 
 | ||||||
|  | 
 | ||||||
def get_checksum(path: Union[Path, str]) -> str:
    """Get the MD5 checksum for a file given its file path.

    path (Union[Path, str]): The file path.
    RETURNS (str): The checksum.
    """
    file_hash = hashlib.md5()
    # Hash the file in chunks so large assets don't have to be loaded into
    # memory all at once
    with Path(path).open("rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            file_hash.update(chunk)
    return file_hash.hexdigest()
|  | 
 | ||||||
|  | 
 | ||||||
| def is_in_jupyter(): | def is_in_jupyter(): | ||||||
|     """Check if user is running spaCy from a Jupyter notebook by detecting the |     """Check if user is running spaCy from a Jupyter notebook by detecting the | ||||||
|     IPython kernel. Mainly used for the displaCy visualizer. |     IPython kernel. Mainly used for the displaCy visualizer. | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user