mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Improve assets and DVC handling (#5719)
* Improve assets and DVC handling * Remove outdated comment [ci skip]
This commit is contained in:
		
							parent
							
								
									a39a110c4e
								
							
						
					
					
						commit
						8cb7f9ccff
					
				|  | @ -1,4 +1,4 @@ | |||
| from typing import List, Dict, Any, Optional, Sequence | ||||
| from typing import List, Dict, Any, Optional, Sequence, Union | ||||
| import typer | ||||
| import srsly | ||||
| from pathlib import Path | ||||
|  | @ -18,7 +18,7 @@ from ..util import ensure_path, run_command, make_tempdir, working_dir | |||
| from ..util import get_hash, get_checksum, split_command | ||||
| 
 | ||||
| 
 | ||||
| CONFIG_FILE = "project.yml" | ||||
| PROJECT_FILE = "project.yml" | ||||
| DVC_CONFIG = "dvc.yaml" | ||||
| DVC_DIR = ".dvc" | ||||
| DIRS = [ | ||||
|  | @ -38,12 +38,12 @@ CACHES = [ | |||
|     os.environ.get("TORCH_HOME"), | ||||
|     Path.home() / ".keras", | ||||
| ] | ||||
| DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit | ||||
| # it directly and edit the project.yml instead and re-run the project.""" | ||||
| DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. Do not edit | ||||
| # it directly and edit the {PROJECT_FILE} instead and re-run the project.""" | ||||
| CLI_HELP = f"""Command-line interface for spaCy projects and working with project | ||||
| templates. You'd typically start by cloning a project template to a local | ||||
| directory and fetching its assets like datasets etc. See the project's | ||||
| {CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data | ||||
| {PROJECT_FILE} for the available commands. Under the hood, spaCy uses DVC (Data | ||||
| Version Control) to manage input and output files and to ensure steps are only | ||||
| re-run if their inputs change. | ||||
| """ | ||||
|  | @ -91,7 +91,7 @@ def project_init_cli( | |||
|     # fmt: off | ||||
|     path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), | ||||
|     force: bool = Opt(False, "--force", "-F", help="Force initiziation"), | ||||
|     force: bool = Opt(False, "--force", "-F", "-f", help="Force initiziation"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Initialize a project directory with DVC and optionally Git. This should | ||||
|  | @ -100,7 +100,7 @@ def project_init_cli( | |||
|     be a Git repo, it should be initialized with Git first, before initializing | ||||
|     DVC. This allows DVC to integrate with Git. | ||||
|     """ | ||||
|     project_init(path, git=git, force=force, silent=True) | ||||
|     project_init(path, git=git, force=force) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("assets") | ||||
|  | @ -110,11 +110,11 @@ def project_assets_cli( | |||
|     # fmt: on | ||||
| ): | ||||
|     """Use DVC (Data Version Control) to fetch project assets. Assets are | ||||
|     defined in the "assets" section of the project config. If possible, DVC | ||||
|     defined in the "assets" section of the project.yml. If possible, DVC | ||||
|     will try to track the files so you can pull changes from upstream. It will | ||||
|     also try and store the checksum so the assets are versioned. If the file | ||||
|     can't be tracked or checked, it will be downloaded without DVC. If a checksum | ||||
|     is provided in the project config, the file is only downloaded if no local | ||||
|     is provided in the project.yml, the file is only downloaded if no local | ||||
|     file with the same checksum exists. | ||||
|     """ | ||||
|     project_assets(project_dir) | ||||
|  | @ -132,7 +132,7 @@ def project_run_all_cli( | |||
|     # fmt: on | ||||
| ): | ||||
|     """Run all commands defined in the project. This command will use DVC and | ||||
|     the defined outputs and dependencies in the project config to determine | ||||
|     the defined outputs and dependencies in the project.yml to determine | ||||
|     which steps need to be re-run and where to start. This means you're only | ||||
|     re-generating data if the inputs have changed. | ||||
| 
 | ||||
|  | @ -151,12 +151,12 @@ def project_run_all_cli( | |||
| def project_run_cli( | ||||
|     # fmt: off | ||||
|     ctx: typer.Context, | ||||
|     subcommand: str = Arg(None, help="Name of command defined in project config"), | ||||
|     subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Run a named script defined in the project config. If the command is | ||||
|     """Run a named script defined in the project.yml. If the command is | ||||
|     part of the default pipeline defined in the "run" section, DVC is used to | ||||
|     determine whether the step should re-run if its inputs have changed, or | ||||
|     whether everything is up to date. If the script is not part of the default | ||||
|  | @ -175,13 +175,13 @@ def project_run_cli( | |||
| @project_cli.command("exec", hidden=True) | ||||
| def project_exec_cli( | ||||
|     # fmt: off | ||||
|     subcommand: str = Arg(..., help="Name of command defined in project config"), | ||||
|     subcommand: str = Arg(..., help=f"Name of command defined in the {PROJECT_FILE}"), | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Execute a command defined in the project config. This CLI command is | ||||
|     """Execute a command defined in the project.yml. This CLI command is | ||||
|     only called internally in auto-generated DVC pipelines, as a shortcut for | ||||
|     multi-step commands in the project config. You typically shouldn't have to | ||||
|     multi-step commands in the project.yml. You typically shouldn't have to | ||||
|     call it yourself. To run a command, call "run" or "run-all". | ||||
|     """ | ||||
|     project_exec(project_dir, subcommand) | ||||
|  | @ -196,15 +196,15 @@ def project_update_dvc_cli( | |||
|     # fmt: on | ||||
| ): | ||||
|     """Update the auto-generated DVC config file. Uses the steps defined in the | ||||
|     "run" section of the project config. This typically happens automatically | ||||
|     "run" section of the project.yml. This typically happens automatically | ||||
|     when running a command, but can also be triggered manually if needed. | ||||
|     """ | ||||
|     config = load_project_config(project_dir) | ||||
|     updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) | ||||
|     if updated: | ||||
|         msg.good(f"Updated DVC config from {CONFIG_FILE}") | ||||
|         msg.good(f"Updated DVC config from {PROJECT_FILE}") | ||||
|     else: | ||||
|         msg.info(f"No changes found in {CONFIG_FILE}, no update needed") | ||||
|         msg.info(f"No changes found in {PROJECT_FILE}, no update needed") | ||||
| 
 | ||||
| 
 | ||||
| app.add_typer(project_cli, name="project") | ||||
|  | @ -241,7 +241,7 @@ def project_clone( | |||
|         cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" | ||||
|         try: | ||||
|             run_command(cmd) | ||||
|         except SystemExit: | ||||
|         except DVCError: | ||||
|             err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." | ||||
|             msg.fail(err) | ||||
|         with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: | ||||
|  | @ -249,7 +249,7 @@ def project_clone( | |||
|         try: | ||||
|             run_command(["git", "-C", str(tmp_dir), "fetch"]) | ||||
|             run_command(["git", "-C", str(tmp_dir), "checkout"]) | ||||
|         except SystemExit: | ||||
|         except DVCError: | ||||
|             err = f"Could not clone '{name}' in the repo '{repo}'." | ||||
|             msg.fail(err) | ||||
|         shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) | ||||
|  | @ -282,27 +282,29 @@ def project_init( | |||
|     with working_dir(project_dir) as cwd: | ||||
|         if git: | ||||
|             run_command(["git", "init"]) | ||||
|         init_cmd = ["dvc", "init"] | ||||
|         if silent: | ||||
|             init_cmd.append("--quiet") | ||||
|         if not git: | ||||
|             init_cmd.append("--no-scm") | ||||
|         if force: | ||||
|             init_cmd.append("--force") | ||||
|         run_command(init_cmd) | ||||
|         flags = {"--force": force, "--quiet": silent, "--no-scm": not git} | ||||
|         try: | ||||
|             run_dvc_command(["init"], flags=flags) | ||||
|         except DVCError: | ||||
|             msg.fail( | ||||
|                 "Failed to initialize project. This likely means that the " | ||||
|                 "project is already initialized and has a .dvc directory. " | ||||
|                 "To force-initialize, use the --force flag.", | ||||
|                 exits=1, | ||||
|             ) | ||||
|         # We don't want to have analytics on by default – our users should | ||||
|         # opt-in explicitly. If they want it, they can always enable it. | ||||
|         if not analytics: | ||||
|             run_command(["dvc", "config", "core.analytics", "false"]) | ||||
|         # Remove unused and confusing plot templates from .dvc directory | ||||
|         # TODO: maybe we shouldn't do this, but it's otherwise super confusing | ||||
|         # once you commit your changes via Git and it creates a bunch of files | ||||
|         # that have no purpose | ||||
|             run_dvc_command(["config", "core.analytics", "false"]) | ||||
|         # Remove unused and confusing plot templates from .dvc directory. | ||||
|         # Otherwise super confusing once you commit your changes via Git and it | ||||
|         # creates a bunch of files that have no purpose. | ||||
|         plots_dir = cwd / DVC_DIR / "plots" | ||||
|         if plots_dir.exists(): | ||||
|             shutil.rmtree(str(plots_dir)) | ||||
|         config = load_project_config(cwd) | ||||
|         setup_check_dvc(cwd, config) | ||||
|     msg.good("Initialized project") | ||||
| 
 | ||||
| 
 | ||||
| def project_assets(project_dir: Path) -> None: | ||||
|  | @ -315,19 +317,33 @@ def project_assets(project_dir: Path) -> None: | |||
|     setup_check_dvc(project_path, config) | ||||
|     assets = config.get("assets", {}) | ||||
|     if not assets: | ||||
|         msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) | ||||
|         msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) | ||||
|     msg.info(f"Fetching {len(assets)} asset(s)") | ||||
|     variables = config.get("variables", {}) | ||||
|     fetched_assets = [] | ||||
|     for asset in assets: | ||||
|         url = asset["url"].format(**variables) | ||||
|         dest = asset["dest"].format(**variables) | ||||
|         fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum")) | ||||
|         url = asset.get("url") | ||||
|         checksum = asset.get("checksum") | ||||
|         if not url: | ||||
|             # project.yml defines asset without URL that the user has to place | ||||
|             if not Path(dest).exists(): | ||||
|                 err = f"No URL provided for asset. You need to add this file yourself: {dest}" | ||||
|                 msg.warn(err) | ||||
|             else: | ||||
|                 if checksum == get_checksum(dest): | ||||
|                     msg.good(f"Asset exists with matching checksum: {dest}") | ||||
|                     fetched_assets.append((project_path / dest).resolve()) | ||||
|                 else: | ||||
|                     msg.fail(f"Asset available but with incorrect checksum: {dest}") | ||||
|             continue | ||||
|         url = url.format(**variables) | ||||
|         fetched_path = fetch_asset(project_path, url, dest, checksum) | ||||
|         if fetched_path: | ||||
|             fetched_assets.append(str(fetched_path)) | ||||
|     if fetched_assets: | ||||
|         with working_dir(project_path): | ||||
|             run_command(["dvc", "add", *fetched_assets, "--external"]) | ||||
|             run_dvc_command(["add", *fetched_assets, "--external"]) | ||||
| 
 | ||||
| 
 | ||||
| def fetch_asset( | ||||
|  | @ -359,19 +375,17 @@ def fetch_asset( | |||
|             # Try with tracking the source first, then just downloading with | ||||
|             # DVC, then a regular non-DVC download. | ||||
|             try: | ||||
|                 dvc_cmd = ["dvc", "import-url", url, str(dest_path)] | ||||
|                 print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) | ||||
|             except subprocess.CalledProcessError: | ||||
|                 dvc_cmd = ["dvc", "get-url", url, str(dest_path)] | ||||
|                 print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) | ||||
|         except subprocess.CalledProcessError: | ||||
|                 run_dvc_command(["import-url", url, str(dest_path)]) | ||||
|             except DVCError: | ||||
|                 run_dvc_command(["get-url", url, str(dest_path)]) | ||||
|         except DVCError: | ||||
|             try: | ||||
|                 download_file(url, dest_path) | ||||
|             except requests.exceptions.HTTPError as e: | ||||
|                 msg.fail(f"Download failed: {dest}", e) | ||||
|                 return None | ||||
|     if checksum and checksum != get_checksum(dest_path): | ||||
|         msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") | ||||
|         msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") | ||||
|     msg.good(f"Fetched asset {dest}") | ||||
|     return dest_path | ||||
| 
 | ||||
|  | @ -384,13 +398,17 @@ def project_run_all(project_dir: Path, *dvc_args) -> None: | |||
|     """ | ||||
|     config = load_project_config(project_dir) | ||||
|     setup_check_dvc(project_dir, config) | ||||
|     dvc_cmd = ["dvc", "repro", *dvc_args] | ||||
|     with working_dir(project_dir): | ||||
|         run_command(dvc_cmd) | ||||
|         try: | ||||
|             run_dvc_command(["repro", *dvc_args]) | ||||
|         except DVCError: | ||||
|             # We could raise a custom error here, but the output produced by | ||||
|             # DVC is already pretty substantial. | ||||
|             sys.exit(1) | ||||
| 
 | ||||
| 
 | ||||
| def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: | ||||
|     """Simulate a CLI help prompt using the info available in the project config. | ||||
|     """Simulate a CLI help prompt using the info available in the project.yml. | ||||
| 
 | ||||
|     project_dir (Path): The project directory. | ||||
|     subcommand (Optional[str]): The subcommand or None. If a subcommand is | ||||
|  | @ -408,15 +426,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: | |||
|         if help_text: | ||||
|             msg.text(f"\n{help_text}\n") | ||||
|     else: | ||||
|         print(f"\nAvailable commands in {CONFIG_FILE}") | ||||
|         print(f"\nAvailable commands in {PROJECT_FILE}") | ||||
|         print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}") | ||||
|         msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) | ||||
|         msg.text("Run all commands defined in the 'run' block of the project config:") | ||||
|         msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:") | ||||
|         print(f"{COMMAND} project run-all {project_dir}") | ||||
| 
 | ||||
| 
 | ||||
| def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: | ||||
|     """Run a named script defined in the project config. If the script is part | ||||
|     """Run a named script defined in the project.yml. If the script is part | ||||
|     of the default pipeline (defined in the "run" section), DVC is used to | ||||
|     execute the command, so it can determine whether to rerun it. It then | ||||
|     calls into "exec" to execute it. | ||||
|  | @ -433,9 +451,13 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: | |||
|     validate_subcommand(commands.keys(), subcommand) | ||||
|     if subcommand in config.get("run", []): | ||||
|         # This is one of the pipeline commands tracked in DVC | ||||
|         dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] | ||||
|         with working_dir(project_dir): | ||||
|             run_command(dvc_cmd) | ||||
|             try: | ||||
|                 run_dvc_command(["repro", subcommand, *dvc_args]) | ||||
|             except DVCError: | ||||
|                 # We could raise a custom error here, but the output produced by | ||||
|                 # DVC is already pretty substantial. | ||||
|                 sys.exit(1) | ||||
|     else: | ||||
|         cmd = commands[subcommand] | ||||
|         # Deps in non-DVC commands aren't tracked, but if they're defined, | ||||
|  | @ -448,8 +470,8 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: | |||
|             run_commands(cmd["script"], variables) | ||||
| 
 | ||||
| 
 | ||||
| def project_exec(project_dir: Path, subcommand: str): | ||||
|     """Execute a command defined in the project config. | ||||
| def project_exec(project_dir: Path, subcommand: str) -> None: | ||||
|     """Execute a command defined in the project.yml. | ||||
| 
 | ||||
|     project_dir (Path): Path to project directory. | ||||
|     subcommand (str): Name of command to run. | ||||
|  | @ -468,15 +490,15 @@ def project_exec(project_dir: Path, subcommand: str): | |||
| 
 | ||||
| 
 | ||||
| def load_project_config(path: Path) -> Dict[str, Any]: | ||||
|     """Load the project config file from a directory and validate it. | ||||
|     """Load the project.yml file from a directory and validate it. | ||||
| 
 | ||||
|     path (Path): The path to the project directory. | ||||
|     RETURNS (Dict[str, Any]): The loaded project config. | ||||
|     RETURNS (Dict[str, Any]): The loaded project.yml. | ||||
|     """ | ||||
|     config_path = path / CONFIG_FILE | ||||
|     config_path = path / PROJECT_FILE | ||||
|     if not config_path.exists(): | ||||
|         msg.fail("Can't find project config", config_path, exits=1) | ||||
|     invalid_err = f"Invalid project config in {CONFIG_FILE}" | ||||
|         msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) | ||||
|     invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." | ||||
|     try: | ||||
|         config = srsly.read_yaml(config_path) | ||||
|     except ValueError as e: | ||||
|  | @ -500,7 +522,7 @@ def update_dvc_config( | |||
|     dict, so if any of the config values change, the DVC config is regenerated. | ||||
| 
 | ||||
|     path (Path): The path to the project directory. | ||||
|     config (Dict[str, Any]): The loaded project config. | ||||
|     config (Dict[str, Any]): The loaded project.yml. | ||||
|     verbose (bool): Whether to print additional info (via DVC). | ||||
|     silent (bool): Don't output anything (via DVC). | ||||
|     force (bool): Force update, even if hashes match. | ||||
|  | @ -514,10 +536,10 @@ def update_dvc_config( | |||
|         with dvc_config_path.open("r", encoding="utf8") as f: | ||||
|             ref_hash = f.readline().strip().replace("# ", "") | ||||
|         if ref_hash == config_hash and not force: | ||||
|             return False  # Nothing has changed in project config, don't need to update | ||||
|             return False  # Nothing has changed in project.yml, don't need to update | ||||
|         dvc_config_path.unlink() | ||||
|     variables = config.get("variables", {}) | ||||
|     commands = [] | ||||
|     dvc_commands = [] | ||||
|     # We only want to include commands that are part of the main list of "run" | ||||
|     # commands in project.yml and should be run in sequence | ||||
|     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} | ||||
|  | @ -535,15 +557,12 @@ def update_dvc_config( | |||
|         deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] | ||||
|         outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] | ||||
|         outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] | ||||
|         dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"] | ||||
|         if verbose: | ||||
|             dvc_cmd.append("--verbose") | ||||
|         if silent: | ||||
|             dvc_cmd.append("--quiet") | ||||
|         dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"] | ||||
|         full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] | ||||
|         commands.append(" ".join(full_cmd)) | ||||
|         dvc_commands.append(" ".join(full_cmd)) | ||||
|     with working_dir(path): | ||||
|         run_commands(commands, variables, silent=True) | ||||
|         dvc_flags = {"--verbose": verbose, "--quiet": silent} | ||||
|         run_dvc_commands(dvc_commands, variables, flags=dvc_flags) | ||||
|     with dvc_config_path.open("r+", encoding="utf8") as f: | ||||
|         content = f.read() | ||||
|         f.seek(0, 0) | ||||
|  | @ -571,7 +590,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: | |||
|     DVC project. | ||||
| 
 | ||||
|     project_dir (Path): The path to the project directory. | ||||
|     config (Dict[str, Any]): The loaded project config. | ||||
|     config (Dict[str, Any]): The loaded project.yml. | ||||
|     """ | ||||
|     if not project_dir.exists(): | ||||
|         msg.fail(f"Can't find project directory: {project_dir}") | ||||
|  | @ -586,38 +605,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: | |||
|     with msg.loading("Updating DVC config..."): | ||||
|         updated = update_dvc_config(project_dir, config, silent=True) | ||||
|     if updated: | ||||
|         msg.good(f"Updated DVC config from changed {CONFIG_FILE}") | ||||
| 
 | ||||
| 
 | ||||
| def run_commands( | ||||
|     commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False | ||||
| ) -> None: | ||||
|     """Run a sequence of commands in a subprocess, in order. | ||||
| 
 | ||||
|     commands (List[str]): The string commands. | ||||
|     variables (Dict[str, str]): Dictionary of variable names, mapped to their | ||||
|         values. Will be used to substitute format string variables in the | ||||
|         commands. | ||||
|     silent (bool): Don't print the commands. | ||||
|     """ | ||||
|     for command in commands: | ||||
|         # Substitute variables, e.g. "./{NAME}.json" | ||||
|         command = command.format(**variables) | ||||
|         command = split_command(command) | ||||
|         # Not sure if this is needed or a good idea. Motivation: users may often | ||||
|         # use commands in their config that reference "python" and we want to | ||||
|         # make sure that it's always executing the same Python that spaCy is | ||||
|         # executed with and the pip in the same env, not some other Python/pip. | ||||
|         # Also ensures cross-compatibility if user 1 writes "python3" (because | ||||
|         # that's how it's set up on their system), and user 2 without the | ||||
|         # shortcut tries to re-run the command. | ||||
|         if len(command) and command[0] in ("python", "python3"): | ||||
|             command[0] = sys.executable | ||||
|         elif len(command) and command[0] in ("pip", "pip3"): | ||||
|             command = [sys.executable, "-m", "pip", *command[1:]] | ||||
|         if not silent: | ||||
|             print(f"Running command: {' '.join(command)}") | ||||
|         run_command(command) | ||||
|         msg.good(f"Updated DVC config from changed {PROJECT_FILE}") | ||||
| 
 | ||||
| 
 | ||||
| def convert_asset_url(url: str) -> str: | ||||
|  | @ -627,7 +615,7 @@ def convert_asset_url(url: str) -> str: | |||
|     RETURNS (str): The converted URL. | ||||
|     """ | ||||
|     # If the asset URL is a regular GitHub URL it's likely a mistake | ||||
|     if re.match("(http(s?)):\/\/github.com", url): | ||||
|     if re.match(r"(http(s?)):\/\/github.com", url): | ||||
|         converted = url.replace("github.com", "raw.githubusercontent.com") | ||||
|         converted = re.sub(r"/(tree|blob)/", "/", converted) | ||||
|         msg.warn( | ||||
|  | @ -679,7 +667,7 @@ def validate_subcommand(commands: Sequence[str], subcommand: str) -> None: | |||
|     """ | ||||
|     if subcommand not in commands: | ||||
|         msg.fail( | ||||
|             f"Can't find command '{subcommand}' in {CONFIG_FILE}. " | ||||
|             f"Can't find command '{subcommand}' in {PROJECT_FILE}. " | ||||
|             f"Available commands: {', '.join(commands)}", | ||||
|             exits=1, | ||||
|         ) | ||||
|  | @ -706,3 +694,112 @@ def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: | |||
|         for data in response.iter_content(chunk_size=chunk_size): | ||||
|             size = f.write(data) | ||||
|             bar.update(size) | ||||
| 
 | ||||
| 
 | ||||
def run_commands(
    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
    """Execute the given command strings one after another in a subprocess.

    commands (List[str]): The string commands.
    variables (Dict[str, str]): Variable names mapped to their values. Used to
        fill in format string placeholders in each command before it is split.
    silent (bool): Don't print the commands before running them.
    """
    python_aliases = ("python", "python3")
    pip_aliases = ("pip", "pip3")
    for raw_command in commands:
        # Fill in placeholders such as "./{NAME}.json", then tokenize.
        parts = split_command(raw_command.format(**variables))
        executable = parts[0] if len(parts) else None
        # Pin "python"/"pip" to the interpreter spaCy itself is running under,
        # so a command always uses the same environment regardless of what
        # "python", "python3", "pip" or "pip3" resolve to on the user's system.
        if executable in python_aliases:
            parts[0] = sys.executable
        elif executable in pip_aliases:
            parts = [sys.executable, "-m", "pip", *parts[1:]]
        if not silent:
            print(f"Running command: {' '.join(parts)}")
        run_command(parts)
| 
 | ||||
| 
 | ||||
def run_dvc_commands(
    commands: List[str] = tuple(),
    variables: Dict[str, str] = {},
    flags: Dict[str, bool] = {},
) -> None:
    """Execute a list of DVC commands in order, each in its own subprocess.

    commands (List[str]): The string commands without the leading "dvc".
    variables (Dict[str, str]): Variable names mapped to their values. Used to
        fill in format string placeholders in each command before it is split.
    flags (Dict[str, bool]): Conditional flags, keyed by flag name. The same
        mapping is handed to run_dvc_command for every command in the list.
    """
    for template in commands:
        # Fill in placeholders such as "./{NAME}.json", then tokenize.
        tokens = split_command(template.format(**variables))
        run_dvc_command(tokens, flags=flags)
| 
 | ||||
| 
 | ||||
def run_dvc_command(
    command: Union[str, List[str]], flags: Dict[str, bool] = {}, silent: bool = False
) -> None:
    """Run a DVC command in a subprocess. This wrapper gives us a bit more
    control over how the output and errors are presented. Raises a DVC error if
    the "dvc" command returns a non-zero exit code and uses the error message
    logged by DVC.

    command (Union[str, List[str]]): The command, without the leading "dvc".
        A string is split into arguments via split_command.
    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
        easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
        Only flags mapped to a truthy value are appended. The mapping is only
        read, never modified.
    silent (bool): Don't print any output.
    RAISES (DVCError): If the process exits with a non-zero return code. The
        error message is whatever the process wrote to stderr.
    """
    if isinstance(command, str):
        command = split_command(command)
    # Append only the flags that are switched on
    active_flags = [flag for flag, is_active in flags.items() if is_active]
    dvc_command = ["dvc", *command, *active_flags]
    proc = subprocess.Popen(dvc_command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # Drain stdout and stderr together. Reading stdout to EOF on its own while
    # stderr is an unread pipe can deadlock once the child fills the stderr
    # pipe buffer. communicate() also waits for the process to exit, which is
    # what populates proc.returncode.
    out, err = proc.communicate()
    if not silent:
        # Output is split on blank lines, so each chunk is one paragraph of
        # DVC output; boilerplate paragraphs are filtered out.
        for chunk in out.decode("utf8").split("\n\n"):
            chunk = chunk.strip()
            if is_relevant_dvc_output(chunk):
                print(f"{chunk}\n")
    if proc.returncode != 0:
        if isinstance(err, bytes):
            err = err.decode("utf8")
        raise DVCError(err)
| 
 | ||||
| 
 | ||||
def is_relevant_dvc_output(line: str) -> bool:
    """Decide whether a piece of DVC output is worth showing to the user.

    line (str): Text written to stdout by DVC.
    RETURNS (bool): False for empty text and for text starting with one of
        the known boilerplate prefixes, True for everything else.
    """
    if not line:
        return False
    # str.startswith accepts a tuple and matches if any prefix matches
    boilerplate_prefixes = ("What's next?", "Having any troubles?")
    return not line.startswith(boilerplate_prefixes)
| 
 | ||||
| 
 | ||||
class DVCError(RuntimeError):
    """Custom error type for anything produced by the DVC CLI. Raised when a
    "dvc" subprocess exits with a non-zero return code; the message carries
    the text the process wrote to stderr.
    """
|  |  | |||
|  | @ -222,7 +222,7 @@ class TrainingSchema(BaseModel): | |||
| class ProjectConfigAsset(BaseModel): | ||||
|     # fmt: off | ||||
|     dest: StrictStr = Field(..., title="Destination of downloaded asset") | ||||
|     url: StrictStr = Field(..., title="URL of asset") | ||||
|     url: Optional[StrictStr] = Field(None, title="URL of asset") | ||||
|     checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") | ||||
|     # fmt: on | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user